From a62cfc3ccf2d096b28e041f2d8bbbe25f68ad00f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=BB=8B=E6=B6=B5=20Zephyr=20Wang?= Date: Sat, 29 May 2021 18:10:00 +0800 Subject: [PATCH 001/191] Fix typo in common.h --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index ac795937c..d27fc4cb7 100644 --- a/common.h +++ b/common.h @@ -392,7 +392,7 @@ typedef int blasint; #endif /*** -To alloc job_t on heap or statck. +To alloc job_t on heap or stack. please https://github.com/xianyi/OpenBLAS/issues/246 ***/ #if defined(OS_WINDOWS) From 448fe1c42a928f2a090f7ec1a5be9b6e81ccbf0f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 24 Aug 2022 19:24:01 +0800 Subject: [PATCH 002/191] Test on PolarFire Soc. `make NOFORTRAN=1 CC=gcc` --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index deef75819..99d603d03 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -118,7 +118,7 @@ Z13 Z14 10.RISC-V 64: -RISCV64_GENERIC +RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) C910V 11.LOONGARCH64: From bef47917bd72f35c151038fee0cf485445476863 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 00:06:25 -0800 Subject: [PATCH 003/191] Initial version for riscv sifive x280 --- Makefile.install | 7 + Makefile.prebuild | 8 + Makefile.riscv64 | 8 + README.md | 5 + TargetList.txt | 1 + benchmark/Makefile | 6 + common_riscv64.h | 4 + cpuid_riscv64.c | 2 + getarch.c | 12 + kernel/riscv64/KERNEL.x280 | 267 ++++++++ kernel/riscv64/amax_rvv.c | 102 +++ kernel/riscv64/amin_rvv.c | 102 +++ kernel/riscv64/asum_rvv.c | 99 +++ kernel/riscv64/axpby_rvv.c | 171 +++++ kernel/riscv64/axpy_rvv.c | 109 +++ kernel/riscv64/copy_rvv.c | 94 +++ kernel/riscv64/dot_rvv.c | 126 ++++ kernel/riscv64/gemm_beta_rvv.c | 89 +++ kernel/riscv64/gemm_ncopy_2_rvv.c | 92 +++ kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ++++ kernel/riscv64/gemm_ncopy_8_rvv.c | 164 +++++ kernel/riscv64/gemm_ncopy_rvv_v1.c | 76 +++ kernel/riscv64/gemm_tcopy_2_rvv.c | 108 +++ kernel/riscv64/gemm_tcopy_4_rvv.c | 236 +++++++ kernel/riscv64/gemm_tcopy_8_rvv.c | 264 ++++++++ kernel/riscv64/gemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ++++++ kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ++++++++++++++ kernel/riscv64/gemmkernel_rvv_v1x8.c | 601 +++++++++++++++++ kernel/riscv64/gemv_n_rvv.c | 94 +++ kernel/riscv64/gemv_t_rvv.c | 119 ++++ kernel/riscv64/iamax_rvv.c | 150 +++++ kernel/riscv64/iamin_rvv.c | 151 +++++ kernel/riscv64/imax_rvv.c | 147 +++++ kernel/riscv64/imin_rvv.c | 147 +++++ kernel/riscv64/izamax_rvv.c | 162 +++++ kernel/riscv64/izamin_rvv.c | 161 +++++ kernel/riscv64/max_rvv.c | 98 +++ kernel/riscv64/min_rvv.c | 98 +++ kernel/riscv64/nrm2_rvv.c | 117 ++++ kernel/riscv64/rot_rvv.c | 149 +++++ kernel/riscv64/scal_rvv.c | 80 +++ kernel/riscv64/sum_rvv.c | 95 +++ kernel/riscv64/swap_rvv.c | 142 ++++ kernel/riscv64/symm_lcopy_rvv_v1.c | 101 +++ kernel/riscv64/symm_ucopy_rvv_v1.c | 100 +++ kernel/riscv64/symv_L_rvv.c | 224 +++++++ kernel/riscv64/symv_U_rvv.c | 221 +++++++ kernel/riscv64/trmm_lncopy_rvv_v1.c | 138 ++++ kernel/riscv64/trmm_ltcopy_rvv_v1.c | 134 ++++ kernel/riscv64/trmm_uncopy_rvv_v1.c | 136 ++++ kernel/riscv64/trmm_utcopy_rvv_v1.c | 133 ++++ kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ++++++++++ kernel/riscv64/trmmkernel_4x4_rvv.c | 881 +++++++++++++++++++++++++ kernel/riscv64/trmmkernel_rvv_v1x8.c | 685 +++++++++++++++++++ kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 847 ++++++++++++++++++++++++ 
kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 840 +++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 792 ++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 828 +++++++++++++++++++++++ kernel/riscv64/trsm_lncopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_ltcopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_uncopy_rvv_v1.c | 121 ++++ kernel/riscv64/trsm_utcopy_rvv_v1.c | 123 ++++ kernel/riscv64/zamax_rvv.c | 113 ++++ kernel/riscv64/zamin_rvv.c | 112 ++++ kernel/riscv64/zasum_rvv.c | 108 +++ kernel/riscv64/zaxpby_rvv.c | 151 +++++ kernel/riscv64/zaxpy_rvv.c | 154 +++++ kernel/riscv64/zcopy_rvv.c | 105 +++ kernel/riscv64/zdot_rvv.c | 170 +++++ kernel/riscv64/zgemm_beta_rvv.c | 117 ++++ kernel/riscv64/zgemv_n_rvv.c | 170 +++++ kernel/riscv64/zgemv_t_rvv.c | 172 +++++ kernel/riscv64/znrm2_rvv.c | 122 ++++ kernel/riscv64/zrot_rvv.c | 181 +++++ kernel/riscv64/zscal_rvv.c | 148 +++++ kernel/riscv64/zsum_rvv.c | 97 +++ kernel/riscv64/zswap_rvv.c | 156 +++++ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 596 +++++++++++++++++ param.h | 44 ++ 80 files changed, 15188 insertions(+) create mode 100644 kernel/riscv64/KERNEL.x280 create mode 100644 kernel/riscv64/amax_rvv.c create mode 100644 kernel/riscv64/amin_rvv.c create mode 100644 kernel/riscv64/asum_rvv.c create mode 100644 kernel/riscv64/axpby_rvv.c create mode 100644 kernel/riscv64/axpy_rvv.c create mode 100644 kernel/riscv64/copy_rvv.c create mode 100644 kernel/riscv64/dot_rvv.c create mode 100644 kernel/riscv64/gemm_beta_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/gemmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/gemv_n_rvv.c create mode 100644 kernel/riscv64/gemv_t_rvv.c create mode 100644 kernel/riscv64/iamax_rvv.c create mode 100644 kernel/riscv64/iamin_rvv.c create mode 100644 kernel/riscv64/imax_rvv.c create mode 100644 kernel/riscv64/imin_rvv.c create mode 100644 kernel/riscv64/izamax_rvv.c create mode 100644 kernel/riscv64/izamin_rvv.c create mode 100644 kernel/riscv64/max_rvv.c create mode 100644 kernel/riscv64/min_rvv.c create mode 100644 kernel/riscv64/nrm2_rvv.c create mode 100644 kernel/riscv64/rot_rvv.c create mode 100644 kernel/riscv64/scal_rvv.c create mode 100644 kernel/riscv64/sum_rvv.c create mode 100644 kernel/riscv64/swap_rvv.c create mode 100644 kernel/riscv64/symm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/symm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/symv_L_rvv.c create mode 100644 kernel/riscv64/symv_U_rvv.c create mode 100644 kernel/riscv64/trmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/trmmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/trsm_kernel_LN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_LT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RN_rvv_v1.c create mode 100644 
kernel/riscv64/trsm_kernel_RT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zamax_rvv.c create mode 100644 kernel/riscv64/zamin_rvv.c create mode 100644 kernel/riscv64/zasum_rvv.c create mode 100644 kernel/riscv64/zaxpby_rvv.c create mode 100644 kernel/riscv64/zaxpy_rvv.c create mode 100644 kernel/riscv64/zcopy_rvv.c create mode 100644 kernel/riscv64/zdot_rvv.c create mode 100644 kernel/riscv64/zgemm_beta_rvv.c create mode 100644 kernel/riscv64/zgemv_n_rvv.c create mode 100644 kernel/riscv64/zgemv_t_rvv.c create mode 100644 kernel/riscv64/znrm2_rvv.c create mode 100644 kernel/riscv64/zrot_rvv.c create mode 100644 kernel/riscv64/zscal_rvv.c create mode 100644 kernel/riscv64/zsum_rvv.c create mode 100644 kernel/riscv64/zswap_rvv.c create mode 100644 kernel/riscv64/ztrmmkernel_2x2_rvv.c diff --git a/Makefile.install b/Makefile.install index 87b5bc870..f1adaa271 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,6 +8,7 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin +OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -38,6 +39,7 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -202,3 +204,8 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! +#Generating release tar + @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz + @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . 
+ + diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..e6a8eab59 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..d6eaf552d 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,11 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif \ No newline at end of file diff --git a/README.md b/README.md index 6ce85e08e..6ecb46178 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) +- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. + ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. 
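The `make HOSTCC=gcc TARGET=x280 ... CC=riscv64-unknown-linux-gnu-clang` command added to the README above cross-compiles the library with clang. The sketch below is one way to sanity-check the resulting build before running the full test suite; it is illustrative only and not part of this patch set. The file name, the suggested compile line, and running under `qemu-riscv64` are assumptions, and the `/opt/OpenBLAS` prefix is simply the default `PREFIX` from Makefile.install — adjust to your install location.

```c
/*
 * Illustrative smoke test (not part of the patch series): links against an
 * installed OpenBLAS build and checks a trivial SGEMM result.
 * Assumed compile line (adjust paths/toolchain as needed):
 *   riscv64-unknown-linux-gnu-clang test_sgemm.c -I/opt/OpenBLAS/include \
 *       -L/opt/OpenBLAS/lib -lopenblas -o test_sgemm
 * Run on the target board or under qemu-riscv64.
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* C = alpha*A*B + beta*C with A = B = I(2), so C should come back as I(2). */
    float A[4] = {1.0f, 0.0f, 0.0f, 1.0f};
    float B[4] = {1.0f, 0.0f, 0.0f, 1.0f};
    float C[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);

    printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]);

    /* Exit status 0 only if the identity came back unchanged. */
    return (C[0] == 1.0f && C[1] == 0.0f &&
            C[2] == 0.0f && C[3] == 1.0f) ? 0 : 1;
}
```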
diff --git a/TargetList.txt b/TargetList.txt index deef75819..6c533361e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -120,6 +120,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC C910V +x280 11.LOONGARCH64: LOONGSONGENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..734c83a26 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a4..221a79901 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,6 +92,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) +#include +#endif + +#if defined(x280) #include #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..5326787e6 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 +#define CPU_x280 2 static char *cpuname[] = { "RISCV64_GENERIC", "C910V" + "x280" }; int detect(void){ diff --git a/getarch.c b/getarch.c index cde5b4e83..0d197285c 100644 --- a/getarch.c +++ b/getarch.c @@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..2eb60f2b4 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,267 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_2x2_rvv.c +ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 2) +SGEMMKERNEL = gemmkernel_2x2_rvv.c +SGEMMONCOPY = gemm_ncopy_2_rvv.c +SGEMMOTCOPY = gemm_tcopy_2_rvv.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMKERNEL = gemmkernel_4x4_rvv.c +SGEMMONCOPY = gemm_ncopy_4_rvv.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c 
+SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 2) +DGEMMKERNEL = gemmkernel_2x2_rvv.c +DGEMMONCOPY = gemm_ncopy_2_rvv.c +DGEMMOTCOPY = gemm_tcopy_2_rvv.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMKERNEL = gemmkernel_4x4_rvv.c +DGEMMONCOPY = gemm_ncopy_4_rvv.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c 
+QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..c9c6e7f73 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..370b6c338 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c new file mode 100644 index 000000000..4f711c9be --- /dev/null +++ b/kernel/riscv64/asum_rvv.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c new file mode 100644 index 000000000..7c35c563d --- /dev/null +++ b/kernel/riscv64/axpby_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + FLOAT_V_T vx, vy; + + if ( n < 0 ) return(0); + + if ( beta == 0.0 ) { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + memset(&y[0], 0, n * sizeof(FLOAT)); + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + size_t vl = VSETVL(n); + vy = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= 
vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..3986f4e21 --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c new file mode 100644 index 000000000..5d5a8bd04 --- /dev/null +++ b/kernel/riscv64/copy_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1 && inc_y == 1) { + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c new file mode 100644 index 000000000..60dcac2f5 --- /dev/null +++ b/kernel/riscv64/dot_rvv.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + double dot = 0.0; + + if ( n <= 0 ) return(dot); + + size_t vlmax = vsetvlmax_e64m8(); + vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); + dot = vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c new file mode 100644 index 000000000..34d1ea078 --- /dev/null +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#endif + +// Optimizes the implementation in ../generic/gemm_beta.c + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vx; + + if (beta == ZERO) { + + vl = VSETVL(m); + vx = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c new file mode 100644 index 000000000..5f55bc349 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_2_rvv.c @@ -0,0 +1,92 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEG2_FLOAT vsseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEG2_FLOAT vsseg2e64_v_f64m4 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; + FLOAT_V_T v1, v2; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + + for(j = (n >> 1); j > 0; j--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset, vl); + VSEV_FLOAT(b_offset, v1, vl); + + a_offset += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c new file mode 100644 index 000000000..4d4efe4c9 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_4_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG4_FLOAT vsseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG4_FLOAT vsseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..525b223c2 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); 
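+            // Note (descriptive comment, not in the original patch): vsseg4e*
+            // stores element i of v1..v4 back to back, so the packed buffer
+            // receives a1[i], a2[i], a3[i], a4[i] per index, reproducing the
+            // 4-column interleaved layout of the generic copy kernel's n&4 tail.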
+ + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2c5230752 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c new file mode 100644 index 000000000..963e1be69 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_2_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT_V_T v1a, v1b, v2a, v2b; + size_t vl; + + //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + for(i = (m >> 1); i > 0; i--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); + VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); + + VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); + + a_offset1 += vl * 2; + a_offset2 += vl * 2; + b_offset1 += vl * m * 2; + } + + if (n & 1) { + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + } + + if (m & 1) { + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); + + VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); + + a_offset += vl * 2; + b_offset += vl * m * 2; + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c new file mode 100644 index 000000000..ac9974b24 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv.c @@ -0,0 +1,236 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEG4_FLOAT vsseg4e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEG4_FLOAT vsseg4e64_v_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/gemm_tcopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3;
+    FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+    FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+    FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
+    FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+    FLOAT_V_T v1, v2, v3, v4;   // vector temporaries for the 4-row packing loops
+
+    //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    b_offset2 = b + m * (n & ~3);
+    b_offset3 = b + m * (n & ~1);
+
+    for(j = (m >> 2); j > 0; j--) {
+        a_offset1 = a_offset;
+        a_offset2 = a_offset1 + lda;
+        a_offset3 = a_offset2 + lda;
+        a_offset4 = a_offset3 + lda;
+        a_offset += 4 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 16;
+
+        for(i = (n >> 2); i > 0; i--) {
+            v1 = VLEV_FLOAT(a_offset1, 4);
+            v2 = VLEV_FLOAT(a_offset2, 4);
+            v3 = VLEV_FLOAT(a_offset3, 4);
+            v4 = VLEV_FLOAT(a_offset4, 4);
+
+            a_offset1 += 4;
+            a_offset2 += 4;
+            a_offset3 += 4;
+            a_offset4 += 4;
+
+            // the 4x4 block belongs to the main panel at b_offset1
+            VSEV_FLOAT(b_offset1, v1, 4);
+            VSEV_FLOAT(b_offset1+4, v2, 4);
+            VSEV_FLOAT(b_offset1+8, v3, 4);
+            VSEV_FLOAT(b_offset1+12, v4, 4);
+
+            b_offset1 += m * 4;
+        }
+
+        if (n & 2) {
+            v1 = VLEV_FLOAT(a_offset1, 2);
+            v2 = VLEV_FLOAT(a_offset2, 2);
+            v3 = VLEV_FLOAT(a_offset3, 2);
+            v4 = VLEV_FLOAT(a_offset4, 2);
+
+            a_offset1 += 2;
+            a_offset2 += 2;
+            a_offset3 += 2;
+            a_offset4 += 2;
+
+            VSEV_FLOAT(b_offset2, v1, 2);
+            VSEV_FLOAT(b_offset2+2, v2, 2);
+            VSEV_FLOAT(b_offset2+4, v3, 2);
+            VSEV_FLOAT(b_offset2+6, v4, 2);
+
+            b_offset2 += 8;
+        }
+
+        if (n & 1) {
+            v1 = VLEV_FLOAT(a_offset1, 1);
+            v2 = VLEV_FLOAT(a_offset2, 1);
+            v3 = VLEV_FLOAT(a_offset3, 1);
+            v4 = VLEV_FLOAT(a_offset4, 1);
+
+            VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1);
+
+            b_offset3 += 4;
+        }
+
+    }
+
+// TODO cleanup
+
+    if (m & 2){
+        a_offset1 = a_offset;
+        a_offset2 = a_offset1 + lda;
+        a_offset += 2 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 8;
+
+        i = (n >> 2);
+        if (i > 0){
+            do{
+                ctemp1 = *(a_offset1 + 0);
+                ctemp2 = *(a_offset1 + 1);
+                ctemp3 = *(a_offset1 + 2);
+                ctemp4 = *(a_offset1 + 3);
+
+                ctemp5 = *(a_offset2 + 0);
+                ctemp6 = *(a_offset2 + 1);
+                ctemp7 = *(a_offset2 + 2);
+                ctemp8 = *(a_offset2 + 3);
+
+                a_offset1 += 4;
+                a_offset2 += 4;
+
+                *(b_offset1 + 0) = ctemp1;
+                *(b_offset1 + 1) = ctemp2;
+                *(b_offset1 + 2) = ctemp3;
+                *(b_offset1 + 3) = ctemp4;
+
+                *(b_offset1 + 4) = ctemp5;
+                *(b_offset1 + 5) = ctemp6; +
*(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..81c1f962b --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,264 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEV_FLOAT vlse32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEV_FLOAT vlse64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 
+= 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..a291b70b8 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c new file mode 100644 index 000000000..ec8961ced --- /dev/null +++ b/kernel/riscv64/gemmkernel_2x2_rvv.c @@ -0,0 +1,214 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG2_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG2_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/2; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + 
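+            // Descriptive note: dot-product formulation. The packed A and B
+            // panels are read as interleaved pairs (vlseg2), each vres
+            // accumulates one of the four C(i,j) partial products across K,
+            // and the vectors are reduced with vfredusum and scaled by alpha
+            // before the writeback below.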
vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c new file mode 100644 index 000000000..aa58bcc76 --- /dev/null +++ b/kernel/riscv64/gemmkernel_4x4_rvv.c @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VLSEG4_FLOAT vlseg4e32_v_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m1 +#define VFMACCVF_FLOAT vfmacc_vf_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VLSEG4_FLOAT vlseg4e64_v_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m1 +#define VFMACCVF_FLOAT vfmacc_vf_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU + + FLOAT_V_T va0, va1, va2, va3; + FLOAT_V_T vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/4; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + vres8 = VFMVVF_FLOAT(0.0, vlmax); + vres9 = VFMVVF_FLOAT(0.0, vlmax); + vres10 = VFMVVF_FLOAT(0.0, vlmax); + vres11 = VFMVVF_FLOAT(0.0, vlmax); + vres12 = VFMVVF_FLOAT(0.0, vlmax); + vres13 = VFMVVF_FLOAT(0.0, vlmax); + vres14 = VFMVVF_FLOAT(0.0, vlmax); + vres15 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); + vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); + vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); + vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); + + vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); + vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); + vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); + vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); + + ptrba += vl*4; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + C2 += 4; + C3 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + ptrba += vl*2; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + C1 += 2; + C2 += 2; + C3 += 2; + } + + if(bm & 1) { + 
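+            // M tail: one remaining row of A is multiplied against the four
+            // packed columns of B, so a single reduction per column feeds
+            // C0..C3 below.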
ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 1; + C1 += 1; + C2 += 1; + C3 += 1; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + if(bn & 2) { + + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); + vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); + + ptrba += vl*4; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + 
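+            // Reductions for this 2x2 tail block are complete: scale by alpha
+            // and accumulate the scalar sums into C0[0..1] and C1[0..1].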
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + ptrba += vl*4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..5cd509f93 --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 
= VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..9d2dee615 --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..a80af81b6 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8b33b3bcb --- /dev/null +++ b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,150 @@ 
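The gemv_t kernel above computes y := y + alpha * A^T * x one column at a time: VFMACCVV accumulates the elementwise products of a column of A with x into vr, VFREDSUM folds vr into a single value, and that value is scaled by alpha and added to the corresponding element of y. A minimal scalar sketch of the same computation, single precision only, with illustrative function and type names:

/* One dot product per column of the column-major matrix a (leading
 * dimension lda), scaled by alpha and accumulated into y. */
static void gemv_t_ref(long m, long n, float alpha,
                       const float *a, long lda,
                       const float *x, long inc_x,
                       float *y, long inc_y)
{
    for (long j = 0; j < n; j++) {        /* one column of a per y element */
        float sum = 0.0f;                 /* plays the role of vr + VFREDSUM */
        for (long i = 0; i < m; i++)
            sum += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * sum;      /* matches "*y += alpha * ..." */
    }
}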
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT 
vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..585b37186 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
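The iamax kernel above keeps, per vector lane, a running maximum of |x| and the position at which that lane last improved: VMFLTVV marks the lanes where the new block beats the running value, VIDV/VADDVX record j plus the lane number under that mask, and VFMAXVV updates the maxima. After the loop, VFREDMAX, VMFGEVF, VFIRSTM and VSLIDEDOWN resolve which lane holds the overall winner. A scalar sketch of the quantity being computed (for ties the vectorized index resolution can pick a different occurrence; illustrative, single precision, with long standing in for BLASLONG):

#include <math.h>

/* 1-based position of the largest |x[i]|, 0 for empty or invalid input,
 * as in the kernel. */
static long iamax_ref(long n, const float *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long  best = 0;
    float maxf = -1.0f;              /* mirrors v_max being seeded with -1 */
    for (long i = 0; i < n; i++) {
        float v = fabsf(x[i * inc_x]);
        if (v > maxf) { maxf = v; best = i; }  /* strict >, first hit wins */
    }
    return best + 1;                 /* BLAS index results are 1-based */
}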
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = 
VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..d84ad968e --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
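iamin above is the mirror image: the running values are seeded with FLT_MAX (which is why <float.h> is required), VMFLTVV marks lanes where a strictly smaller |x| appears, and the final extraction uses VMFLEVF instead of VMFGEVF. From user code both kernels are reached through the BLAS index routines; a small driver, assuming the CBLAS interface and OpenBLAS's cblas_isamin extension, could look like this:

#include <stdio.h>
#include <cblas.h>   /* cblas_isamin is an OpenBLAS extension, assumed here */

int main(void)
{
    float x[4] = { 1.0f, -7.5f, 3.25f, -0.5f };

    /* CBLAS returns 0-based indices; the kernels themselves return 1-based. */
    CBLAS_INDEX imax = cblas_isamax(4, x, 1);   /* 1: |-7.5| is the largest  */
    CBLAS_INDEX imin = cblas_isamin(4, x, 1);   /* 3: |-0.5| is the smallest */

    printf("isamax = %lu, isamin = %lu\n",
           (unsigned long)imax, (unsigned long)imin);
    return 0;
}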
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_min; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = 
VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..fb734f6f8 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
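imax above drops the VFABSV step and seeds with -FLT_MAX, so it returns the position of the largest signed element. Its structure is the common two-phase argmax of these kernels: phase one keeps one running maximum and candidate index per lane, phase two reduces across lanes and picks the first lane whose value reaches the overall maximum. Spelled out in scalar form for unit stride, with NLANES as an illustrative stand-in for the vector length:

#include <float.h>

#define NLANES 8   /* stand-in for vlmax; any positive width works */

static long imax_two_phase(long n, const float *x)
{
    if (n <= 0) return 0;

    float vmax[NLANES];
    long  vidx[NLANES];
    for (int l = 0; l < NLANES; l++) { vmax[l] = -FLT_MAX; vidx[l] = 0; }

    /* Phase 1: per-lane running maximum and the index that last improved it. */
    for (long j = 0; j < n; j += NLANES)
        for (int l = 0; l < NLANES && j + l < n; l++)
            if (vmax[l] < x[j + l]) { vmax[l] = x[j + l]; vidx[l] = j + l; }

    /* Phase 2: the VFREDMAX, VMFGEVF, VFIRSTM, VSLIDEDOWN sequence. */
    float maxf = -FLT_MAX;
    for (int l = 0; l < NLANES; l++) if (vmax[l] > maxf) maxf = vmax[l];
    for (int l = 0; l < NLANES; l++)
        if (vmax[l] >= maxf) return vidx[l] + 1;   /* 1-based */

    return 0;   /* not reached for finite inputs */
}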
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = 
VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..9cb332cbb --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,162 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
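The inc_x != 1 paths in these kernels all follow the same convention: the byte stride handed to the strided load VLSEV is inc_x * sizeof(FLOAT), so lane i of a load started at x reads x[i * inc_x]; the complex kernels that follow double the stride because each element is a (re, im) pair. In scalar terms:

/* What lane i of vlse32(x, inc_x * sizeof(float), vl) observes. */
static float strided_lane(const float *x, long i, long inc_x)
{
    return x[i * inc_x];
}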
+*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + 
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..69771e5aa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
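izamax above covers the complex case: VLSEG2/VLSSEG2 de-interleave the real and imaginary parts into vx0 and vx1, and the quantity being maximized is |re| + |im| (the magnitude convention of BLAS i?amax) rather than the true modulus. A single-precision scalar sketch, illustrative only:

#include <math.h>

/* x holds n interleaved (re, im) pairs, spaced inc_x complex elements apart. */
static long izamax_ref(long n, const float *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long  best = 0;
    float maxf = -1.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * inc_x;    /* one complex element */
        float v = fabsf(p[0]) + fabsf(p[1]);   /* |re| + |im| */
        if (v > maxf) { maxf = v; best = i; }
    }
    return best + 1;
}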
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = 
VFMINVV_FLOAT(v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..5b1380d2b --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
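A note on the reductions used throughout these files: the VFREDMIN/VFREDMAX/VFREDSUM calls are passed (dest, vector, init, vl), the init value travels in element 0 of an LMUL=1 vector, and only element 0 of the result is meaningful, which VFMVFS then reads back as a scalar. Ignoring NaN handling, the vfredmin call above behaves like this scalar loop (illustrative names):

/* Element 0 of the result of VFREDMINVS_FLOAT(dest, v, init, vl);
 * VFMVFS_FLOAT_M1 is what reads this value back out. */
static float vfredmin_ref(float init, const float *v, long vl)
{
    float r = init;
    for (long i = 0; i < vl; i++)
        if (v[i] < r) r = v[i];
    return r;
}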
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..bddcc0ba7 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c new file mode 100644 index 000000000..3f5d50397 --- /dev/null +++ b/kernel/riscv64/nrm2_rvv.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
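max_rvv and min_rvv above return the largest and smallest signed element of x (note there is no VFABSV step), using the same block-then-reduce pattern: per-lane running extrema via VFMAXVV/VFMINVV, then one cross-lane reduction at the end. Scalar reference for the max case, illustrative only:

/* Largest signed element; 0.0 is returned for empty or invalid input,
 * matching the early-out in the kernel. */
static float max_ref(long n, const float *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0f;
    float m = x[0];
    for (long i = 1; i < n; i++)
        if (x[i * inc_x] > m) m = x[i * inc_x];
    return m;
}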
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define ABS fabsf +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define ABS fabs +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if( n <= 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl * inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c new file mode 100644 index 000000000..7bf5e4270 --- /dev/null +++ b/kernel/riscv64/rot_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if(n <= 0) return(0); + + FLOAT_V_T v0, v1, vx, vy; + + if (inc_x == 0 || inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, 
vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c new file mode 100644 index 000000000..d2c0378bf --- /dev/null +++ b/kernel/riscv64/scal_rvv.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
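+
+Note on the kernel below: SCAL is computed in place (x[i] *= da) with a
+vector-length-agnostic loop; VSETVL picks vl on every pass, so partial tails
+need no scalar cleanup. The unit-stride path uses the vle/vse forms, while the
+general path uses the strided vlse/vsse forms with the stride expressed in
+bytes (inc_x * sizeof(FLOAT)).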
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( (n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + + } + + return 0; +} diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c new file mode 100644 index 000000000..1db0d09dd --- /dev/null +++ b/kernel/riscv64/sum_rvv.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
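+
+Implementation note for the kernel below: SUM accumulates raw element values
+(no absolute value, unlike ASUM) into a single wide accumulator with vfadd
+inside the strip-mined loop, and performs the vfredusum reduction to a scalar
+only once, after the loop, over vlmax lanes.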
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c new file mode 100644 index 000000000..2cf92f6ad --- /dev/null +++ b/kernel/riscv64/swap_rvv.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG stride_x, stride_y; + FLOAT_V_T vx, vy; + + if (n <= 0) return(0); + + if (inc_x == 0 && inc_y == 0) { + if (n & 1) { + FLOAT temp = x[0]; + x[0] = y[0]; + y[0] = temp; + } + else { + return 0; + } + } + else if(inc_x == 0) { + FLOAT temp = x[0]; + x[0] = y[(n - 1) * inc_y]; + FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one + stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c 
b/kernel/riscv64/symm_lcopy_rvv_v1.c new file mode 100644 index 000000000..f0def9617 --- /dev/null +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -0,0 +1,101 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_lcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, 
va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..958506df3 --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
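+
+Packing note for the kernel below: the SYMM upper-triangle pack loads, for
+every packed vector, both the contiguous slice and the stride-lda slice of A
+(the two mirror images of the same logical elements), builds a per-lane mask
+from vid plus the (posX - posY) offset, and uses vmerge to keep, lane by lane,
+the copy that lies in the triangle of A that is actually stored.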
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..737abaae3 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else 
if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 000000000..cb923be5d --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
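+
+Kernel note: in the SYMV upper-triangle routine below, each handled column of
+A is streamed once and used twice: vfmacc with the per-column scalar temp1
+updates y, and vfmacc of the same column with x accumulates a dot product into
+vr, which is reduced with vfredusum once per column; the diagonal term is then
+added in scalar code after the reduction. Four branches select unit-stride or
+strided (vlse/vsse) access depending on inc_x and inc_y.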
+*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vl_max); + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + vr = VFMVVF_FLOAT(0, vl); + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = 
VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..73a8233f8 --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
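+
+Packing note for the kernel below: in the TRMM lower/non-transposed copy, row
+segments of the current vl-wide column panel that lie strictly below the
+diagonal are copied whole with stride-lda loads, segments strictly above it
+are stepped over without storing, and the diagonal vl x vl block is emitted
+row by row, with a vid-based compare zeroing the lanes past the diagonal and,
+when UNIT is defined, ones forced onto the diagonal itself.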
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..2fe8cf79e --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..b64cd840d --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..b96daae5b --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + 
}while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c new file mode 100644 index 000000000..127e76970 --- /dev/null +++ b/kernel/riscv64/trmmkernel_2x2_rvv.c @@ -0,0 +1,342 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
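+
+Kernel note: the 2x2 TRMM kernel below keeps four vector accumulators, one per
+element of the 2x2 block of C; the segment loads (vlseg2e) de-interleave the
+packed 2-wide A and B panels, vfmacc multiplies and accumulates across k, and
+a single vfredusum per accumulator yields the scalar that is scaled by alpha
+and stored. The LEFT/TRANSA/offset preprocessor logic restricts the k range to
+the triangular part, mirroring ../generic/trmmkernel_2x2.c.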
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_2x2.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 v_res, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); + C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+2; + C1 = C1+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off*2; 
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + C1 = C1+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off; +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + vb0 = VLEV_FLOAT(ptrbb, vl); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); + + ptrba += vl * 2; + ptrbb += vl; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp; +#endif +#ifdef LEFT + off += 2; +#endif + + C0 = C0+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + ptrba += vl; + ptrbb += vl; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp; + ptrbb += temp; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c new file mode 100644 index 000000000..3e46c6348 --- /dev/null +++ 
b/kernel/riscv64/trmmkernel_4x4_rvv.c @@ -0,0 +1,881 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMUL_FLOAT vfmul_vv_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMUL_FLOAT vfmul_vv_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_4x4.c + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + + FLOAT_V_T vres0_0; + FLOAT_V_T vres0_1; + FLOAT_V_T vres0_2; + FLOAT_V_T vres0_3; + + FLOAT_V_T vres1_0; + FLOAT_V_T vres1_1; + FLOAT_V_T vres1_2; + 
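+    // Accumulators for the 4x4 block: vres<j>_<i> collects, across the K loop,
+    // the element-wise products that form the scalar C<j>[i] -- roughly the
+    // vector form of
+    //     for (k) res[j][i] += A_packed[k][i] * B_packed[k][j];
+    // each accumulator is reduced with vfredusum after the loop and scaled by alpha.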
FLOAT_V_T vres1_3; + + FLOAT_V_T vres2_0; + FLOAT_V_T vres2_1; + FLOAT_V_T vres2_2; + FLOAT_V_T vres2_3; + + FLOAT_V_T vres3_0; + FLOAT_V_T vres3_1; + FLOAT_V_T vres3_2; + FLOAT_V_T vres3_3; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); + vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); + vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); + + ptrba += vl * 4; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + if (!backwards) { + temp = bk-off; + temp = left ? 
temp - 4 : // number of values in A + temp - 4; // number of values in B + + ptrba += temp*4; // number of values in A + ptrbb += temp*4; // number of values in B + } +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + C2 = C2+4; + C3 = C3+4; + + } + + if ( bm & 2 ) // do any 2x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres2_1 = VFMVVF_FLOAT(0, vlmax); + + vres3_0 = VFMVVF_FLOAT(0, vlmax); + vres3_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+4; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + ptrba += vl * 2; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); + + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + C2 = C2+2; + C3 = C3+2; + + } + + if ( bm & 1 ) // do any 1x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres3_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, 
va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + C2 = C2+1; + C3 = C3+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + for (j=0; j<(bn&2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + + ptrba += vl * 4; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + + } + + if ( bm & 2 ) // do any 2x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) 
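+        // Strip-mined K loop: vl = VSETVL(k) returns min(k, VLMAX), so the
+        // remainder needs no scalar tail; each pass consumes vl two-element
+        // groups from the packed A and B panels via the segment loads below.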
+ { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + + } + + if ( bm & 1 ) // do any 1x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*2; +#endif + + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + for (j=0; j<(bn&1); j+=1) // do the Mx1 loops + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + + ptrba += vl * 4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + 
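+        // After this 1x2 block, step the packed pointers over the part of the
+        // K panel excluded by the triangular offset so that ptrba/ptrbb start
+        // the next block at the correct position in the packed buffers.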
temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + + } + + if ( bm & 2 ) // do any 2x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + ptrba += vl * 2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + + } + + if ( bm & 1 ) // do any 1x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..97b14650c --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = 
VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb 
+ 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, 
vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = 
VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..11a0398ca --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,847 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + pa = a; + pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc0, vc1; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + //bb = *(c + i + j * ldc); + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + //bb *= aa; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + //*b = bb; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + + //*(c + i + j * ldc) = bb; + //b ++; + + b += 2; + //pa = a + i + 1; + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + pa = a; + //pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + 
VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * 
aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + + b += 8; + + pc0 = c + j * ldc * 8; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#else +static 
inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, 
ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..0380bd1bb --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
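The trsm_kernel_LT_rvv_v1.c file added below vectorizes the forward-substitution step of the left/transposed TRSM case, with one solve() variant per GEMM_DEFAULT_UNROLL_N. A minimal scalar model of what that step computes, mirroring the generic fallback branch further down in the file (the name solve_lt_ref is illustrative, and the packed triangle is assumed to carry reciprocal diagonal entries, since the trsm packing routines store INV(diag)):

static void solve_lt_ref(int m, int n, const float *a, float *b,
                         float *c, int ldc)
{
    for (int i = 0; i < m; i++) {
        float aa = a[i];                      // 1/diag, written by the packing routine
        for (int j = 0; j < n; j++) {
            float bb = c[i + j * ldc] * aa;   // solve row i of column j
            *b++ = bb;                        // pack the solved value for later GEMM updates
            c[i + j * ldc] = bb;
            for (int k = i + 1; k < m; k++)   // eliminate from the rows below
                c[k + j * ldc] -= bb * a[k];
        }
        a += m;                               // next packed column of the triangle
    }
}

The inner k-loop over the trailing rows is what the vector kernel strip-mines: each pass takes vl = vsetvl(remaining), loads up to vl elements of a and of the C column, and folds the update into a single vfnmsac.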
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc; + for (i = 0; i < m; i++) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b++; + pa = a + i + 1; + pc = c + j * ldc + i + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc0, vc1; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += 
m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + b += 8; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = 
pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT 
aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + + +static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + FLOAT *pa, *pc; + + int i, j, k; + + size_t vl; + FLOAT_V_T va0, va1, vc0, vc1; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + pa = a + (i + 1) * 2; + pc = c + j * ldc + (i + 1) * 2; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, pa, vl); + VLSEG2_FLOAT(&vc0, &vc1, pc, vl); +#ifndef CONJ + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#else + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); + vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#endif + VSSEG2_FLOAT(pc, vc0, vc1, vl); + pa += vl * 2; + pc += vl * 2; + } + } + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 
1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..41368be60 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,792 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
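trsm_kernel_RN_rvv_v1.c below handles the right/non-transposed case. Here the elimination runs along rows of C rather than down columns, so the vector loop uses strided loads and stores (VLSEV/VSSEV with a byte stride of sizeof(FLOAT)*ldc). A minimal scalar model of the solve step, mirroring the generic fallback branch in the file (solve_rn_ref is an illustrative name; the packed B panel is assumed to hold reciprocal diagonal entries):

static void solve_rn_ref(int m, int n, float *a, const float *b,
                         float *c, int ldc)
{
    for (int i = 0; i < n; i++) {
        float bb = b[i];                      // 1/diag of the packed B panel
        for (int j = 0; j < m; j++) {
            float aa = c[j + i * ldc] * bb;   // solve C(j, i)
            *a++ = aa;                        // pack the solved value
            c[j + i * ldc] = aa;
            for (int k = i + 1; k < n; k++)   // update the columns to the right
                c[j + k * ldc] -= aa * b[k];
        }
        b += n;                               // next packed row of the triangle
    }
}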
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + + for (j = 0; j < m; j ++) + { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b + i + 1; + pc = c + j + (i + 1) *ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc ++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = 
VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; 
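/* aa0..aa7 are the solved entries of column i for this strip of eight rows;
   they are written back to C and into the packed A panel, and the k-loop
   below subtracts aa_r * B(i, i+1:n) from the rest of row r of C using
   ldc-strided vector loads and stores. */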
+ *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + + a += 8; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + pc4 = pa4 + ldc; + pc5 = pa5 + ldc; + pc6 = pa6 + ldc; + pc7 = pa7 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, 
vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} +#else +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; 
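/* This width-j remainder panel is done: b and c have been stepped past its
   columns, and kk now counts every column solved so far, which is the GEMM
   depth used when updating the next panel. */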
+ } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..459c1663a --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,828 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
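trsm_kernel_RT_rvv_v1.c below is the right/transposed case: the columns of C are solved from last to first and each one is eliminated from the columns to its left, again with ldc-strided vector accesses. A minimal scalar model, mirroring the generic fallback branch in the file (solve_rt_ref is an illustrative name; the packed B panel holds reciprocal diagonal entries):

static void solve_rt_ref(int m, int n, float *a, const float *b,
                         float *c, int ldc)
{
    a += (n - 1) * m;                         // start at the last packed block
    b += (n - 1) * n;
    for (int i = n - 1; i >= 0; i--) {
        float bb = b[i];                      // 1/diag of the packed B panel
        for (int j = 0; j < m; j++) {
            float aa = c[j + i * ldc] * bb;   // solve C(j, i)
            *a++ = aa;                        // pack the solved value
            c[j + i * ldc] = aa;
            for (int k = 0; k < i; k++)       // update the columns to the left
                c[j + k * ldc] -= aa * b[k];
        }
        b -= n;
        a -= 2 * m;                           // a moved forward by m inside the j-loop
    }
}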
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b; + pc = c + j; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc++; + } + } + b -= n; + a -= 2 * m; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 2; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + 
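/* Strip-mined update for the last, unpaired row: subtract aa0 times the
   first i packed entries of B from the columns left of column i,
   vl elements per pass. */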
vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc + j * 2; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * 
bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + a += 8; + + pb = b; + pc0 = c + j * 8; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + pc4 = pc3 + 1; + pc5 = pc4 + 1; + pc6 = pc5 + 1; + pc7 = pc6 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, 
aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - 
GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..bacfb2b08 --- /dev/null +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
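trsm_lncopy_rvv_v1.c below (and the ltcopy/uncopy/utcopy variants that follow) packs the triangular panel consumed by the solve kernels: the diagonal entry is replaced by INV(diag) (1/diag, or ONE when UNIT is defined) and the unused triangle is skipped with a vid_v/vmsltu mask. A rough scalar model of what one vl-wide column block of this lower/non-transposed packing produces (illustrative only: pack_trsm_ln_ref is not part of the patch, and it assumes the whole panel fits in a single block, i.e. n <= vl):

static void pack_trsm_ln_ref(int m, int n, const float *a, int lda,
                             int offset, float *b)
{
    for (int i = 0; i < m; i++, b += n) {
        int di = i - offset;                  // row position relative to the diagonal block
        if (di >= 0 && di < n) {              // row of the diagonal block
            for (int j = 0; j < di; j++)
                b[j] = a[i + j * lda];        // strictly lower part of the row
            b[di] = 1.0f / a[i + di * lda];   // inverted diagonal entry
        } else if (di >= n) {                 // rows below the diagonal block
            for (int j = 0; j < n; j++)
                b[j] = a[i + j * lda];
        }                                     // rows above it are skipped but still leave a gap in b
    }
}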
+*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..0fc7c9f24 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..ee869a795 --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a324b0fa6 --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..1917042be --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..3f027383a --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } 
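+        // vmin now holds per-lane partial minima of |re| + |im|; they are reduced to the scalar result below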
+ + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c new file mode 100644 index 000000000..7876646b3 --- /dev/null +++ b/kernel/riscv64/zasum_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c new file mode 100644 index 000000000..66f52d9d0 --- /dev/null +++ b/kernel/riscv64/zaxpby_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/*************************************************************************** +* 2014/06/07 Saar +* +***************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + BLASLONG stride_x = inc_x2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y2 * sizeof(FLOAT); + FLOAT_V_T vx0, vx1, vy0, vy1; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + size_t vl = VSETVL(n); + FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + 
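+                    // v0 so far = alpha_r*x_re - alpha_i*x_im; the next two ops add beta_r*y_re - beta_i*y_im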
v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..777bcb728 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 
= VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..5d8322bbb --- /dev/null +++ b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 vle32_v_f32m8 +#define VSEV_FLOAT_M8 vse32_v_f32m8 + +#define VSETVL_M4(n) vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#else +#define VSETVL_M8(n) vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 vle64_v_f64m8 +#define VSEV_FLOAT_M8 vse64_v_f64m8 + +#define VSETVL_M4(n) vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSEG_FLOAT_M4(&vr, &vi, x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } else if (1 == inc_y) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSEG_FLOAT_M4(y, vr, vi, vl); + } + } else { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..7eae6f608 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMSACVV_FLOAT vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMSACVV_FLOAT vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + 
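+            // x is strided (stride_x bytes between complex elements) while y is contiguous, so pair a strided segment load with a unit-stride one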
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..a89752d18 --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VLSEG_FLOAT(&vr, &vi, c_offset, vl); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..2eeb61b45 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = 
alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..b682d5cd8 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, 
vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..921ddb8cb --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m4 +#endif + +// TODO: Should single precision use the widening MAC, or perhaps all should be double? + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if ( n <= 0 ) return(0.0); + + FLOAT_V_T vr, v0, v1; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if (inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c new file mode 100644 index 000000000..68066a00b --- /dev/null +++ b/kernel/riscv64/zrot_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + + if (inc_x == 0 && inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + 
vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..079c36a2d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + + if(da_r == 0.0 && da_i == 0.0) { + + vr = VFMVVF_FLOAT(0.0, vlmax); + vi = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VSSEG_FLOAT(x, vr, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + } + + } else if(da_r == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vi, -da_i, vl); + vi = VFMULVF_FLOAT(vr, da_i, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + + } else if(da_i == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vr = VFMULVF_FLOAT(vr, da_r, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + + } else { + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vr, &vi, x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSEG_FLOAT(x, vt, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + } + } + + return(0); +} diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..3928fbe27 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c new file mode 100644 index 000000000..86f9103d3 --- /dev/null +++ b/kernel/riscv64/zswap_rvv.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+
+    if (n <= 0) return(0);
+
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {
+            FLOAT temp[2];
+            temp[0] = x[0];
+            temp[1] = x[1];
+            x[0] = y[0];
+            x[1] = y[1];
+            y[0] = temp[0];
+            y[1] = temp[1];
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        FLOAT temp[2];
+        temp[0] = x[0];
+        temp[1] = x[1];
+        x[0] = y[(n - 1) * inc_y * 2];
+        x[1] = y[(n - 1) * inc_y * 2 + 1];
+        FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one
+        BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1; // shift the remaining n-1 elements of y up by one, from the end
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vy0, &vy1, ptr - inc_y * 2, stride_y, vl);
+            VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
+        }
+        y[0] = temp[0];
+        y[1] = temp[1];
+    }
+    else if(inc_y == 0) {
+        FLOAT temp[2];
+        temp[0] = y[0];
+        temp[1] = y[1];
+        y[0] = x[(n - 1) * inc_x * 2];
+        y[1] = x[(n - 1) * inc_x * 2 + 1];
+        FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one
+        BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1; // shift the remaining n-1 elements of x up by one, from the end
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vx0, &vx1, ptr - inc_x * 2, stride_x, vl);
+            VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
+        }
+        x[0] = temp[0];
+        x[1] = temp[1];
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y,
vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..3486a4648 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = 
VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, 
vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, 
vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = 
VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/param.h b/param.h index 514b13a3a..62b675d6c 100644 --- a/param.h +++ b/param.h @@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If VLMAX size is ever more than 1024, this should be increased also. 
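+ * (For reference: the RVV spec defines VLMAX = VLEN * LMUL / SEW, so assuming the
+ * "v1" (LMUL=1) kernels operating on 32-bit elements, a 1024-bit VLEN gives
+ * VLMAX = 1024 / 32 = 32, which is presumably where the value of 32 below comes from.)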
*/ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 18:22:21 -0800 Subject: [PATCH 004/191] Remove redundant files --- Makefile.install | 5 - kernel/riscv64/KERNEL.x280 | 36 +- kernel/riscv64/gemm_ncopy_2_rvv.c | 92 --- kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ---- kernel/riscv64/gemm_tcopy_2_rvv.c | 108 ---- kernel/riscv64/gemm_tcopy_4_rvv.c | 236 -------- kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ------- kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ---------------- kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ----------- kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ---------------------------- 10 files changed, 2 insertions(+), 2543 deletions(-) delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c diff --git a/Makefile.install b/Makefile.install index f1adaa271..168d08f72 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin -OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -39,7 +38,6 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" - @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -204,8 +202,5 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! -#Generating release tar - @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz - @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . 
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 2eb60f2b4..4d64354fb 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c # SGEMM_UNROLL_N set in params.h -ifeq ($(SGEMM_UNROLL_N), 2) -SGEMMKERNEL = gemmkernel_2x2_rvv.c -SGEMMONCOPY = gemm_ncopy_2_rvv.c -SGEMMOTCOPY = gemm_tcopy_2_rvv.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMKERNEL = gemmkernel_4x4_rvv.c -SGEMMONCOPY = gemm_ncopy_4_rvv.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 8) +ifeq ($(SGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX SGEMMKERNEL = gemmkernel_rvv_v1x8.c SGEMMINCOPY = gemm_ncopy_rvv_v1.c @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif # SGEMM_UNROLL_N set in params.h -ifeq ($(DGEMM_UNROLL_N), 2) -DGEMMKERNEL = gemmkernel_2x2_rvv.c -DGEMMONCOPY = gemm_ncopy_2_rvv.c -DGEMMOTCOPY = gemm_tcopy_2_rvv.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMKERNEL = gemmkernel_4x4_rvv.c -DGEMMONCOPY = gemm_ncopy_4_rvv.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 8) +ifeq ($(DGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX DGEMMKERNEL = gemmkernel_rvv_v1x8.c DGEMMINCOPY = gemm_ncopy_rvv_v1.c diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c deleted file mode 100644 index 5f55bc349..000000000 --- a/kernel/riscv64/gemm_ncopy_2_rvv.c +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEG2_FLOAT vsseg2e32_v_f32m4 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEG2_FLOAT vsseg2e64_v_f64m4 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - FLOAT_V_T v1, v2; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - - for(j = (n >> 1); j > 0; j--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset, vl); - VSEV_FLOAT(b_offset, v1, vl); - - a_offset += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c deleted file mode 100644 index 4d4efe4c9..000000000 --- a/kernel/riscv64/gemm_ncopy_4_rvv.c +++ /dev/null @@ -1,123 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG4_FLOAT vsseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG4_FLOAT vsseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset; - - FLOAT_V_T v1, v2, v3, v4; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - for(j = (n >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - v3 = VLEV_FLOAT(a_offset3, vl); - v4 = VLEV_FLOAT(a_offset4, vl); - - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); - - a_offset1 += vl; - a_offset2 += vl; - a_offset3 += vl; - a_offset4 += vl; - b_offset += vl*4; - } - } - - if (n & 2) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - a_offset1 = a_offset; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - - VSEV_FLOAT(b_offset, v1, vl); - - a_offset1 += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c deleted file mode 100644 index 963e1be69..000000000 --- a/kernel/riscv64/gemm_tcopy_2_rvv.c +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset, *b_offset1, *b_offset2; - FLOAT_V_T v1a, v1b, v2a, v2b; - size_t vl; - - //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - b_offset2 = b + m * (n & ~1); - - for(i = (m >> 1); i > 0; i--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 4; - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); - VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); - - VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); - - a_offset1 += vl * 2; - a_offset2 += vl * 2; - b_offset1 += vl * m * 2; - } - - if (n & 1) { - *(b_offset2 + 0) = *(a_offset1 + 0); - *(b_offset2 + 1) = *(a_offset2 + 0); - b_offset2 += 2; - } - } - - if (m & 1) { - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); - - VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); - - a_offset += vl * 2; - b_offset += vl * m * 2; - } - - if (n & 1){ - *(b_offset2 + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c deleted file mode 100644 index ac9974b24..000000000 --- a/kernel/riscv64/gemm_tcopy_4_rvv.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - - //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - b_offset2 = b + m * (n & ~3); - b_offset3 = b + m * (n & ~1); - - for(j = (m >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - b_offset1 = b_offset; - b_offset += 16; - - for(i = (n >> 2); i > 0; i--) { - v1 = VLEV_FLOAT(a_offset1, 4); - v2 = VLEV_FLOAT(a_offset2, 4); - v3 = VLEV_FLOAT(a_offset3, 4); - v4 = VLEV_FLOAT(a_offset4, 4); - - a_offset1 += 4; - a_offset2 += 4; - a_offset3 += 4; - a_offset4 += 4; - - VSEV_FLOAT(b_offset1, v1, 4); - VSEV_FLOAT(b_offset2+4, v2, 4); - VSEV_FLOAT(b_offset2+8, v3, 4); - VSEV_FLOAT(b_offset2+12, v4, 4); - - b_offset1 += m * 4; - } - - if (n & 2) { - v1 = VLEV_FLOAT(a_offset1, 2); - v2 = VLEV_FLOAT(a_offset2, 2); - v3 = VLEV_FLOAT(a_offset3, 2); - v4 = VLEV_FLOAT(a_offset4, 2); - - a_offset1 += 2; - a_offset2 += 2; - a_offset3 += 2; - a_offset4 += 2; - - VSEV_FLOAT(b_offset2, v1, 2); - VSEV_FLOAT(b_offset2+2, v2, 2); - VSEV_FLOAT(b_offset2+4, v3, 2); - VSEV_FLOAT(b_offset2+6, v4, 2); - - b_offset2 += 8; - } - - if (n & 1) { - v1 = VLEV_FLOAT(a_offset1, 1); - v2 = VLEV_FLOAT(a_offset2, 1); - v3 = VLEV_FLOAT(a_offset3, 1); - v4 = VLEV_FLOAT(a_offset4, 1); - - VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); - - b_offset3 += 4; - } - - } - -// TODO cleanup - - if (m & 2){ - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 8; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - ctemp5 = *(a_offset2 + 0); - ctemp6 = *(a_offset2 + 1); - ctemp7 = *(a_offset2 + 2); - ctemp8 = 
*(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - *(b_offset1 + 4) = ctemp5; - *(b_offset1 + 5) = ctemp6; - *(b_offset1 + 6) = ctemp7; - *(b_offset1 + 7) = ctemp8; - - b_offset1 += m * 4; - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - - ctemp3 = *(a_offset2 + 0); - ctemp4 = *(a_offset2 + 1); - - a_offset1 += 2; - a_offset2 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - *(b_offset2 + 2) = ctemp3; - *(b_offset2 + 3) = ctemp4; - - b_offset2 += 4; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset2 + 0); - - *(b_offset3 + 0) = ctemp1; - *(b_offset3 + 1) = ctemp2; - b_offset3 += 2; - } - } - - if (m & 1){ - a_offset1 = a_offset; - b_offset1 = b_offset; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - a_offset1 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - b_offset1 += 4 * m; - - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - a_offset1 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - *(b_offset3 + 0) = ctemp1; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c deleted file mode 100644 index ec8961ced..000000000 --- a/kernel/riscv64/gemmkernel_2x2_rvv.c +++ /dev/null @@ -1,214 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG2_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG2_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/2; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - 
vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c deleted file mode 100644 index aa58bcc76..000000000 --- a/kernel/riscv64/gemmkernel_4x4_rvv.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VLSEG4_FLOAT vlseg4e32_v_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m1 -#define VFMACCVF_FLOAT vfmacc_vf_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m1(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VLSEG4_FLOAT vlseg4e64_v_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m1 -#define VFMACCVF_FLOAT vfmacc_vf_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU - - FLOAT_V_T va0, va1, va2, va3; - FLOAT_V_T vb0, vb1, vb2, vb3; - FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; - FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/4; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - C2 = C1 + ldc; - C3 = C2 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - vres8 = VFMVVF_FLOAT(0.0, vlmax); - vres9 = VFMVVF_FLOAT(0.0, vlmax); - vres10 = VFMVVF_FLOAT(0.0, vlmax); - vres11 = VFMVVF_FLOAT(0.0, vlmax); - vres12 = VFMVVF_FLOAT(0.0, vlmax); - vres13 = VFMVVF_FLOAT(0.0, vlmax); - vres14 = VFMVVF_FLOAT(0.0, vlmax); - vres15 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); - vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); - vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); - vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); - - vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); - vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); - vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); - vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); - - ptrba += vl*4; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - C2 += 4; - C3 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - ptrba += vl*2; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - C1 += 2; - C2 += 2; - C3 += 2; - } - - if(bm & 1) { - 
ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); - vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 1; - C1 += 1; - C2 += 1; - C3 += 1; - } - - bb += (bk<<2); - C += (ldc<<2); - } - - if(bn & 2) { - - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); - vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); - vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); - - ptrba += vl*4; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - 
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - ptrba += vl*4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c deleted file mode 100644 index 127e76970..000000000 --- a/kernel/riscv64/trmmkernel_2x2_rvv.c +++ /dev/null @@ -1,342 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_2x2.c - - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - BLASLONG off, temp; - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 v_res, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - -#if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#else - off = 0; -#endif - - for (j = bn/2; j > 0; j--) - { - C0 = C; - C1 = C0+ldc; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - for (k = temp; k 
> 0; k -= vl) - { - vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); - C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 2; -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 2; -#endif - C0 = C0+2; - C1 = C1+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 2; -#endif - ptrba += temp; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - C1 = C1+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - if (bn & 1) - { - C0 = C; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off; -#endif - - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - vb0 = VLEV_FLOAT(ptrbb, vl); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); - - ptrba += vl * 2; - ptrbb += vl; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 1; -#endif - ptrba += temp*2; - ptrbb += temp; -#endif -#ifdef LEFT - off += 2; -#endif - - C0 = C0+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off + 1; -#else - temp = off + 1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - ptrba += vl; - ptrbb += vl; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 1; -#endif - ptrba += temp; - ptrbb += temp; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} - diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c deleted file mode 100644 index 3e46c6348..000000000 --- a/kernel/riscv64/trmmkernel_4x4_rvv.c +++ /dev/null @@ -1,881 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMUL_FLOAT vfmul_vv_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMUL_FLOAT vfmul_vv_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_4x4.c - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - - FLOAT_V_T vres0_0; - FLOAT_V_T vres0_1; - FLOAT_V_T vres0_2; - FLOAT_V_T vres0_3; - - FLOAT_V_T vres1_0; - FLOAT_V_T vres1_1; - FLOAT_V_T vres1_2; - FLOAT_V_T vres1_3; - - FLOAT_V_T vres2_0; - FLOAT_V_T vres2_1; - FLOAT_V_T vres2_2; - FLOAT_V_T vres2_3; - - FLOAT_V_T vres3_0; - FLOAT_V_T vres3_1; - FLOAT_V_T vres3_2; - FLOAT_V_T vres3_3; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - - for (j=0; j 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); - vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); - vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); - - ptrba += vl * 4; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = 
VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - if (!backwards) { - temp = bk-off; - temp = left ? temp - 4 : // number of values in A - temp - 4; // number of values in B - - ptrba += temp*4; // number of values in A - ptrbb += temp*4; // number of values in B - } -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+4; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - ptrba += vl * 2; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, 
vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); - - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres3_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; -#endif - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - - ptrba += vl * 4; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, 
vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = 
alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - - ptrba += vl * 4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - ptrba += vl * 2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl 
= VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} From 9702d57b11351a5360a2f0326c69c3f550c784d2 Mon Sep 17 00:00:00 2001 From: HellerZheng Date: Wed, 16 Nov 2022 11:11:04 +0800 Subject: [PATCH 005/191] Update Makefile.install --- Makefile.install | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 168d08f72..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -202,5 +202,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - - From 3918d8504e7720d94221025ae6078a2459ccb104 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 21 Nov 2022 19:06:07 -0800 Subject: [PATCH 006/191] nrm2 simple optimization --- kernel/riscv64/nrm2_rvv.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 3f5d50397..979c31648 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -39,9 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVV_FLOAT vfmacc_vv_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m8 #define ABS fabsf #else #define VSETVL(n) vsetvl_e64m8(n) @@ -54,9 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMACCVV_FLOAT vfmacc_vv_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m8 #define ABS fabs #endif @@ -68,12 +64,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; + FLOAT_V_T_M1 v_res; + FLOAT ssq = 0.0; size_t vlmax = VSETVL_MAX; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); vr = VFMVVF_FLOAT(0, vlmax); @@ -83,9 +78,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLEV_FLOAT(x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } @@ -98,20 +90,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } - } v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - return(scale * sqrt(ssq)); + return sqrt(ssq); } From 387e8970cd8ce581a6c7bc48418860966140f621 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 28 Nov 2022 21:42:29 -0800 Subject: [PATCH 007/191] Fix merge problem; Update compiling COMMON_OPT per review comments. --- Makefile.prebuild | 2 +- Makefile.riscv64 | 6 +++--- common_riscv64.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index e6a8eab59..c4f4a2602 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -56,7 +56,7 @@ TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif ifeq ($(TARGET), x280) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_GENERIC) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d6eaf552d..d091984a6 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,10 +3,10 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static -endif \ No newline at end of file +endif diff --git a/common_riscv64.h b/common_riscv64.h index 221a79901..2092bd5ab 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) -#include +#include #endif #if defined(x280) From c19dff0a31c58163dc386b0c4270e75f576b97be Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 25 Jan 2023 19:33:32 +0800 Subject: [PATCH 008/191] Fix T-Head RVV intrinsic API changes. 
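The diff below only renames intrinsics in the macro blocks at the top of each *_vector.c kernel: the older T-Head spelling leaves the element width implicit in the vector type (vle_v_f32m8, vlse_v_f64m8, vse_v_f32m4, ...), while the updated RVV intrinsic API encodes the width in the name (vle32_v_f32m8, vlse64_v_f64m8, vse32_v_f32m4, ...). The kernel bodies keep calling the same macros. A minimal sketch of the pattern, taken from the single-precision amax_vector.c hunk below; the two wrapper functions are illustrative only (not part of this diff) and assume a toolchain that provides the non-prefixed v0.10-style intrinsic names used throughout these kernels:

    #include <stddef.h>
    #include <riscv_vector.h>

    /* old T-Head spelling (element width implied by the f32m8 type):
     *   #define VLEV_FLOAT  vle_v_f32m8
     *   #define VLSEV_FLOAT vlse_v_f32m8
     * spelling after this patch, with the element width (32) explicit: */
    #define VLEV_FLOAT  vle32_v_f32m8
    #define VLSEV_FLOAT vlse32_v_f32m8

    /* call sites are unchanged, e.g. a unit-stride and a strided load */
    static inline vfloat32m8_t load_unit(const float *x, size_t vl)
    { return VLEV_FLOAT(x, vl); }
    static inline vfloat32m8_t load_strided(const float *x, ptrdiff_t stride_x, size_t vl)
    { return VLSEV_FLOAT(x, stride_x, vl); }   /* stride_x is a byte stride, as in the kernels */
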
--- kernel/riscv64/amax_vector.c | 8 ++++---- kernel/riscv64/amin_vector.c | 8 ++++---- kernel/riscv64/asum_vector.c | 8 ++++---- kernel/riscv64/axpby_vector.c | 16 ++++++++-------- kernel/riscv64/axpy_vector.c | 16 ++++++++-------- kernel/riscv64/copy_vector.c | 16 ++++++++-------- kernel/riscv64/dot_vector.c | 8 ++++---- kernel/riscv64/gemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/gemv_t_vector.c | 8 ++++---- kernel/riscv64/iamax_vector.c | 8 ++++---- kernel/riscv64/iamin_vector.c | 8 ++++---- kernel/riscv64/imax_vector.c | 8 ++++---- kernel/riscv64/imin_vector.c | 8 ++++---- kernel/riscv64/izamax_vector.c | 4 ++-- kernel/riscv64/izamin_vector.c | 4 ++-- kernel/riscv64/max_vector.c | 8 ++++---- kernel/riscv64/min_vector.c | 8 ++++---- kernel/riscv64/nrm2_vector.c | 8 ++++---- kernel/riscv64/rot_vector.c | 16 ++++++++-------- kernel/riscv64/scal_vector.c | 16 ++++++++-------- kernel/riscv64/swap_vector.c | 16 ++++++++-------- kernel/riscv64/symv_L_vector.c | 16 ++++++++-------- kernel/riscv64/symv_U_vector.c | 16 ++++++++-------- kernel/riscv64/zamax_vector.c | 4 ++-- kernel/riscv64/zamin_vector.c | 4 ++-- kernel/riscv64/zasum_vector.c | 8 ++++---- kernel/riscv64/zaxpby_vector.c | 8 ++++---- kernel/riscv64/zaxpy_vector.c | 8 ++++---- kernel/riscv64/zcopy_vector.c | 8 ++++---- kernel/riscv64/zdot_vector.c | 8 ++++---- kernel/riscv64/zgemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/zgemv_t_vector.c | 4 ++-- kernel/riscv64/zhemv_LM_vector.c | 8 ++++---- kernel/riscv64/zhemv_UV_vector.c | 8 ++++---- kernel/riscv64/znrm2_vector.c | 8 ++++---- kernel/riscv64/zrot_vector.c | 16 ++++++++-------- kernel/riscv64/zscal_vector.c | 8 ++++---- kernel/riscv64/zswap_vector.c | 16 ++++++++-------- 38 files changed, 190 insertions(+), 190 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55..1b7799340 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9..f9b7defae 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a82275153..fc73362bc 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec2..676dfd474 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814..6f921f2d6 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6c..fee5e195d 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40..f47e0c0b5 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b..bb9ab8e5a 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa..7d0b70cbb 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9..9fea522f7 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cc..4e81e7848 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8..ca48a3c48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a3310..2a677098d 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8..66a101566 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c720310..818193a9e 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c9..7f31e9a53 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df6..14b7e01ed 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 2a83e2a52..cf6fdb741 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c69..43a65e552 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFMSACVF_FLOAT vfmsac_vf_f32m4 @@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFMSACVF_FLOAT vfmsac_vf_f64m4 diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7c..8b9ef5a3e 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #define VFMULVF_FLOAT vfmul_vf_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #define VFMULVF_FLOAT vfmul_vf_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #endif diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f1..b16592808 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda..58ec17b03 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6..34ff0e30a 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f..bfb282ae0 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac..d9eca7f10 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e..0d1cc42f1 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 3eca20415..5e6034ac5 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 303d3541e..4ccfe4a81 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 600f02bba..55a480a35 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index ec38ed9d2..0900206b3 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index b5ee1f054..3095c28f9 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index e930dc2a2..a7a8a5279 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 275ee9131..0a284a999 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 2f46977d4..33b7c9c25 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 59d0e219d..cadabdb75 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 2fdd8135a..858dfd173 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 64323aa3a..d275b75f8 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 7550294b5..c1dcaccab 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) From 63cf4d01668f8f6c73a05039bc36785ba78b0940 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Wed, 1 Feb 2023 19:13:44 -0800 Subject: [PATCH 009/191] add riscv level3 C,Z kernel functions. --- kernel/riscv64/KERNEL.x280 | 85 +++- kernel/riscv64/trmm_lncopy_rvv_v1.c | 8 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 644 +++--------------------- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 658 +++---------------------- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 610 +++-------------------- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 623 +++-------------------- kernel/riscv64/zgemm_ncopy_4_rvv.c | 121 +++++ kernel/riscv64/zgemm_ncopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemm_tcopy_4_rvv.c | 181 +++++++ kernel/riscv64/zgemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemmkernel_rvv_v1x4.c | 475 ++++++++++++++++++ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 124 +++++ kernel/riscv64/zhemm_utcopy_rvv_v1.c | 120 +++++ kernel/riscv64/zsymm_lcopy_rvv_v1.c | 106 ++++ kernel/riscv64/zsymm_ucopy_rvv_v1.c | 106 ++++ kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 145 ++++++ kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 143 ++++++ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 144 ++++++ kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 140 ++++++ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 574 +++++++++++++++++++++ kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 115 +++++ kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 114 +++++ kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 113 +++++ kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 115 +++++ param.h | 10 +- 25 files changed, 3342 insertions(+), 2280 deletions(-) create mode 100644 kernel/riscv64/zgemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/zhemm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/zhemm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/ztrsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 4d64354fb..217d8534e 
100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -118,8 +118,8 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -CTRMMKERNEL = ztrmmkernel_2x2_rvv.c -ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c # SGEMM_UNROLL_N set in params.h ifeq ($(SGEMM_UNROLL_N), 8) @@ -168,17 +168,28 @@ DSYMMUCOPY_M = symm_ucopy_rvv_v1.c DSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c @@ -190,20 +201,25 @@ DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c -TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c -TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c -TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c @@ -214,6 +230,27 @@ CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + 
+ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c LSAME_KERNEL = ../generic/lsame.c diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 73a8233f8..3457ca3e1 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT vse32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 #define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 11a0398ca..2cba06b38 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -88,606 +91,107 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - FLOAT *pa, *pc; + FLOAT aa; + FLOAT* pc; int i, j, k; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug - size_t vl; - FLOAT_V_T va, vc; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - pa = a; - pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - 
VSEV_FLOAT(pc, vc, vl); - pa += vl; - pc += vl; - } - } - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; - - int i, j, k; - fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + FLOAT_V_T vb, vc; size_t vl; - FLOAT_V_T va, vc0, vc1; a += (m - 1) * m; b += (m - 1) * n; - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - //bb = *(c + i + j * ldc); - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - //bb *= aa; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - //*b = bb; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - - //*(c + i + j * ldc) = bb; - //b ++; - - b += 2; - //pa = a + i + 1; - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - pa = a; - //pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, 
bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (m - 1) * m; - b += (m - 1) * n; + for (i = m - 1; i >= 0; i--) { - for (i = m - 1; i >= 0; i--) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - - b += 8; - - pc0 = c + j * ldc * 8; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 
= VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - a -= m; b -= 2 * n; } } -#else -static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - - int i, j, k; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = 0; k < i; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a -= m; - b -= 2 * n; - } - -} - -#endif - #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; - a += (m - 1) * m * 2; - b += (m - 1) * n * 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = m - 1; i >= 0; i--) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + for (i = m - 1; i >= 0; i--) { - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - - *(b + 0) = cc1; 
- *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - } - + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; } - a -= m * 2; - b -= 4 * n; - } - } + #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 0380bd1bb..492a5631f 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -87,468 +90,39 @@ static FLOAT dm1 = -1.; // Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - FLOAT aa, bb; - FLOAT *pa, *pc; - int i, j, k; - size_t vl; - FLOAT_V_T va, vc; - for (i = 0; i < m; i++) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b++; - pa = a + i + 1; - pc = c + j * ldc + i + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - VSEV_FLOAT(pc, vc, vl); - 
pa += vl; - pc += vl; - } - } - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; + FLOAT aa; + FLOAT* pc; int i, j, k; - size_t vl; - FLOAT_V_T va, vc0, vc1; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 4 -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; + FLOAT_V_T vb, vc; - int i, j, k; size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) 
= bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + for (i = 0; i < m; i++) { - int i, j, k; - size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - for (i = 0; i < m; i++) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - b += 8; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; 
- pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - a += m; } } @@ -557,146 +131,60 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - for (i = 0; i < m; i++) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = i + 1; k < m; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a += m; - } -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; - - for (i = 0; i < m; i++) { - - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); - - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); - -#ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; -#else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; -#endif - - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = i + 1; k < m; k ++){ -#ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#endif - } - - } - a += m * 2; - } -} - - -static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - FLOAT *pa, *pc; - - int i, j, k; - - size_t vl; - FLOAT_V_T va0, va1, vc0, vc1; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = 0; i < m; i++) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + ldc *= 2; - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * 
ldc); + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - pa = a + (i + 1) * 2; - pc = c + j * ldc + (i + 1) * 2; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, pa, vl); - VLSEG2_FLOAT(&vc0, &vc1, pc, vl); + for (k = i + 1; k < m; k++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); -#else - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); - vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pc, vc0, vc1, vl); - pa += vl * 2; - pc += vl * 2; + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; } - } + + a += m * 2; } - a += m * 2; - } } #endif @@ -714,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 41368be60..4751ae012 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -32,28 +32,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif - static FLOAT dm1 = -1.; #ifdef CONJ @@ -86,569 +90,99 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - - for (j = 0; j < m; j ++) - { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b + i + 1; - pc = c + j + (i + 1) *ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc ++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT_V_T va, vc; - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; + for (i = 0; i < n; i++) { - for (i = 0; i < n; i++) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b += n; } } -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = 
*pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - - a += 8; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - pc4 = pa4 + ldc; - pc5 = pa5 + ldc; - pc6 = pa6 + ldc; - pc7 = pa7 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb 
= b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} #else -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - int i, j, k; - - for (i = 0; i < n; i++) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = i + 1; k < n; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b += n; - } -} - -#endif +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { -#else + FLOAT bb1, bb2; -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *pci, *pcj; - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; + int i, j, k; - int i, j, k; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - ldc *= 2; + size_t vl; - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); - for (j = 0; j < m; j ++) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = -aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = i + 1; k < n; k ++){ + for (k = i + 1; k < n; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b += n * 2; } - b += n * 2; - } } #endif @@ -666,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, 
offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 459c1663a..93a9e6916 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -32,25 +32,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSEV_FLOAT vse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSEV_FLOAT vse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -86,497 +85,38 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - a += (n - 1) * m; - b += (n - 1) * n; - - for (i = n - 1; i >= 0; i--) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b; - pc = c + j; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc++; - } - } - b -= n; - a -= 2 * m; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT_V_T va, vc; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 2; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, 
vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; a += (n - 1) * m; b += (n - 1) * n; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc + j * 2; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (n - 1) * m; - b += (n - 1) * n; + for (i = n - 1; i >= 0; i--) { - for (i = n - 1; i >= 0; i--) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - 
*pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - a += 8; - - pb = b; - pc0 = c + j * 8; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - pc4 = pc3 + 1; - pc5 = pc4 + 1; - pc6 = pc5 + 1; - pc7 = pc6 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i 
* ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b -= n; a -= 2 * m; @@ -587,92 +127,65 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT bb1, bb2; - for (i = n - 1; i >= 0; i--) { + FLOAT *pci, *pcj; - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = 0; k < i; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b -= n; - a -= 2 * m; - } - -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; + int i, j, k; - a += (n - 1) * m * 2; - b += (n - 1) * n * 2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - for (i = n - 1; i >= 0; i--) { + size_t vl; - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; - for (j = 0; j < m; j ++) { + for (i = n - 1; i >= 0; i--) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = - aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; } - b -= n * 2; - a -= 4 * m; - } - } #endif @@ -689,7 +202,7 @@ int CNAME(BLASLONG m, 
BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug kk = n - offset; c += n * ldc * COMPSIZE; diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c new file mode 100644 index 000000000..389ee5d57 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/zgemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + + FLOAT *boffset; + + FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + size_t vl; + + aoffset = a; + boffset = b; + lda *= 2; + + for (j = (n >> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); + VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); + + VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + + VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + + VSSEG2_FLOAT(boffset, v11, v12, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..df039bab6 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, v0, v1, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 000000000..1b34039c8 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..7622fb810 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 
2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); + VSSEG2_FLOAT(boffset, v0, v1, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 000000000..50e29222f --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,475 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, 
*(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, 
*(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + VLSEG2_FLOAT(&va0, &va1, C2, vl); + VLSEG2_FLOAT(&va2, &va3, C3, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, 
vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..cf466d3fa --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 
0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 000000000..6209f5417 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,120 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
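*/

/*
 * zhemm_utcopy_rvv_v1.c below mirrors the zhemm_ltcopy routine above.  A vid_v
 * index vector plus the running offset classifies every lane as above, on, or
 * below the diagonal; a vector merge then picks, per lane, either the directly
 * addressed element or its transposed counterpart; the imaginary part of the
 * lanes that had to be fetched from the mirrored location is negated with
 * VFRSUB_VF, so those entries are conjugated as a Hermitian matrix requires;
 * and lanes that fall on the diagonal get their imaginary part forced to zero.
 * The two variants differ only in which operand is read with the strided
 * segment load and in the sign test that selects the conjugated lanes.
 */

/*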
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c new file mode 100644 index 000000000..df5c916a5 --- /dev/null +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 000000000..dcf2b081a --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,106 @@ 
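The zsymm_ucopy_rvv_v1.c routine introduced next is the mirror image of zsymm_lcopy_rvv_v1.c above: both pack a panel of the symmetric complex matrix while dereferencing only one stored triangle, classifying every lane against the diagonal with a vid_v index vector plus a scalar offset and choosing between an element and its transpose with a branch-free vector merge (no conjugation is involved, unlike the zhemm copies). The scalar sketch below models the selection performed for one vl-wide strip by the lcopy variant; it is for illustration only, and the helper name, the plain double element type and the strip-at-a-time interface are assumptions rather than part of the kernel API.

#include <stddef.h>

/* Scalar model (illustration only) of one vl-wide strip packed by
   zsymm_lcopy_rvv_v1.c: lane k of step i takes A(posX+k, posY+i) when that
   position lies strictly below the diagonal, and the mirrored A(posY+i, posX+k)
   otherwise, so only entries with row >= column are ever read.  The matrix is
   column-major with leading dimension lda (in complex elements) and interleaved
   real/imaginary parts. */
static void zsymm_lcopy_strip_ref(size_t m, size_t vl, const double *a, size_t lda,
                                  size_t posX, size_t posY, double *b)
{
    for (size_t i = 0; i < m; i++) {
        for (size_t k = 0; k < vl; k++) {
            size_t row, col;
            if (posX + k > posY + i) {        /* strictly below the diagonal      */
                row = posX + k;  col = posY + i;
            } else {                          /* on or above it: use the mirror   */
                row = posY + i;  col = posX + k;
            }
            const double *src = a + 2 * (row + col * lda);
            *b++ = src[0];                    /* real part                        */
            *b++ = src[1];                    /* imaginary part, not conjugated   */
        }
    }
}

Doing the same selection with VMERGE_VVM on whole vectors is what lets the RVV copy routines avoid a per-element branch on the diagonal test.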
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 
0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..afd694408 --- /dev/null +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -0,0 +1,145 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
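*/

/*
 * Overview of the ztrmm packing below (ztrmm_lncopy_rvv_v1.c): the panel is
 * processed in vl-wide strips, one vector per step.  Vectors that lie entirely
 * inside the referenced triangle are copied verbatim with strided segment loads
 * (VLSSEG2), keeping each real/imaginary pair together; vectors entirely on the
 * other side of the diagonal only advance the pointers.  The block that
 * straddles the diagonal is emitted one vector at a time: a vid_v index vector
 * and VFMERGE_VFM clear the lanes past the diagonal, and when UNIT is defined
 * the diagonal lane itself is rewritten as 1 + 0i.  The ltcopy, uncopy and
 * utcopy variants that follow apply the same scheme to the other
 * storage/transpose combinations, differing only in which side is skipped and
 * whether the loads are strided or unit-stride.
 */

/*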
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..c7d593949 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + //VSEV_FLOAT(b, vb, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 
000000000..3c70b6385 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > posY) 
+ { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..706782cf0 --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,140 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
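*/

/*
 * ztrmm_utcopy_rvv_v1.c below follows the same pattern as the ztrmm copy
 * routines above, but reads vl consecutive complex elements at a time with
 * unit-stride segment loads (VLSEG2) instead of strided ones.  Vectors on the
 * unreferenced side of the diagonal are skipped (only the pointers advance),
 * fully referenced vectors are copied as-is, and the block containing the
 * diagonal is masked lane by lane with VMSGTU/VFMERGE, again forcing 1 + 0i on
 * the diagonal when UNIT is defined.
 */

/*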
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 000000000..27409ec25 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,574 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = 
VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); 
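+ /* Each (even, odd) pair of vres registers accumulates the real and imaginary
+    parts of one column of the complex product: OP_rr/OP_ir/OP_ii/OP_ri resolve
+    to vfmacc/vfnmsac above so that acc_re += b_re*a_re -/+ b_im*a_im and
+    acc_im += b_re*a_im +/- b_im*a_re, with the signs chosen by the
+    conjugation variant (NN/NT/.../CC) selected by the macro block at the top
+    of the file. The b_* scalars are read from the packed B panel (ptrbb) and
+    the a_* lanes come from the segment loads of the packed A panel (va0..va7). */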
+ vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb 
+ 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, 
*(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..b7ccb1eb3 --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..911b81de5 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..db075c29b --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..e121c6273 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va0, va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/param.h b/param.h index 62b675d6c..236f50075 100644 --- a/param.h +++ b/param.h @@ -3055,11 +3055,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 160 #define DGEMM_DEFAULT_P 160 From 240695862984d4de845f1c42821a883946932df7 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 24 Feb 2023 10:44:55 +0000 Subject: [PATCH 010/191] * update intrinsics to match latest spec at https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_ prefixes for rvv intrinsics) * fix multiple numerical stability and corner case issues * add a script to generate arbitrary gemm kernel shapes * add a generic zvl256b target to demonstrate large gemm kernel unrolls --- common_riscv64.h | 15 +- cpuid_riscv64.c | 10 +- kernel/generic/trmmkernel_16x8.c | 3676 ++++++++++++++++++++ kernel/generic/zlaswp_ncopy_8.c | 1051 ++++++ kernel/riscv64/KERNEL.RISCV64_ZVL256B | 199 ++ kernel/riscv64/amax_vector.c | 231 +- kernel/riscv64/amin_vector.c | 252 +- kernel/riscv64/asum_vector.c | 99 +- kernel/riscv64/axpby_vector.c | 47 +- kernel/riscv64/axpy_vector.c | 42 +- kernel/riscv64/cgemm_kernel_8x8_zvl256b.c | 1931 ++++++++++ kernel/riscv64/copy_vector.c | 39 +- kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c | 2007 +++++++++++ kernel/riscv64/dgemm_kernel_8x8_zvl256b.c | 860 +++++ kernel/riscv64/dot.c | 2 +- kernel/riscv64/dot_vector.c | 86 +- kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c | 1068 ++++++ kernel/riscv64/gemv_n_vector.c | 24 +- kernel/riscv64/gemv_t_vector.c | 91 +- kernel/riscv64/generate_kernel.py | 670 ++++ kernel/riscv64/iamax_vector.c | 180 +- kernel/riscv64/iamin_vector.c | 160 +- kernel/riscv64/imax_vector.c | 124 +- kernel/riscv64/imin_vector.c | 185 +- kernel/riscv64/izamax_vector.c | 277 +- kernel/riscv64/izamin_vector.c | 275 +- kernel/riscv64/max_vector.c | 77 +- kernel/riscv64/min_vector.c | 77 +- kernel/riscv64/nrm2_vector.c | 342 +- kernel/riscv64/nrm2_vector_dot.c | 8 +- kernel/riscv64/rot_vector.c | 42 +- kernel/riscv64/scal_vector.c | 83 +- kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ++++++ kernel/riscv64/strmm_kernel_16x8_zvl256b.c | 1330 +++++++ kernel/riscv64/sum_vector.c | 114 + kernel/riscv64/swap_vector.c | 54 +- kernel/riscv64/symv_L_vector.c | 82 +- kernel/riscv64/symv_U_vector.c | 86 +- kernel/riscv64/zamax_vector.c | 90 +- kernel/riscv64/zamin_vector.c | 89 +- kernel/riscv64/zasum_vector.c | 107 +- kernel/riscv64/zaxpby_vector.c | 32 +- kernel/riscv64/zaxpy_vector.c | 20 +- kernel/riscv64/zcopy_vector.c | 12 +- kernel/riscv64/zdot_vector.c | 60 +- kernel/riscv64/zgemm_kernel_8x4_zvl256b.c | 1253 +++++++ kernel/riscv64/zgemm_kernel_generic.c | 140 + kernel/riscv64/zgemv_n_vector.c | 28 +- kernel/riscv64/zgemv_t_vector.c | 88 +- kernel/riscv64/zhemv_LM_vector.c | 60 +- kernel/riscv64/zhemv_UV_vector.c | 60 +- kernel/riscv64/znrm2_vector.c | 365 +- kernel/riscv64/zrot_vector.c | 38 +- kernel/riscv64/zscal_vector.c | 32 +- kernel/riscv64/zsum_vector.c | 131 + kernel/riscv64/zswap_vector.c | 50 +- kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c | 1337 +++++++ param.h | 39 + 58 files changed, 18634 insertions(+), 2374 deletions(-) create mode 100644 kernel/generic/trmmkernel_16x8.c create mode 100644 kernel/generic/zlaswp_ncopy_8.c create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL256B create 
mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c create mode 100755 kernel/riscv64/generate_kernel.py create mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/strmm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/sum_vector.c create mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c create mode 100644 kernel/riscv64/zgemm_kernel_generic.c create mode 100644 kernel/riscv64/zsum_vector.c create mode 100644 kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c diff --git a/common_riscv64.h b/common_riscv64.h index 2092bd5ab..de79c8cab 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) -#include -#endif - -#if defined(x280) -#include +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +# include +# if !defined(DOUBLE) +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# else +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# endif +#else +# define EXTRACT_FLOAT(v) (v[0]) #endif #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 5326787e6..1b6b62f21 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 -#define CPU_x280 2 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_RISCV64_ZVL256B 2 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" - "x280" + "C910V", + "CPU_RISCV64_ZVL256B" }; int detect(void){ diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 000000000..5412eab70 --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + FLOAT 
res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = 
B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = 
*(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; 
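+ /* the remaining stores scatter the displaced a1 row (real parts A1,A3,A5,A7
+    and imaginary parts A9,A11,A13,A15 across the four columns) into the pivot
+    row addressed by b2/b4/b6/b8; each complex element occupies two FLOATs,
+    real at offset 0 and imaginary at offset 1. */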
+ *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 
= *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 
+ 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 000000000..d8690682f --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = 
sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = 
zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 1b7799340..81a39af32 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +95,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = 
*((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index f9b7defae..c8ba75f4a 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = *x; + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT 
vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else 
if(inc_x == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..b1739f248 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, 
B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + 
dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + a_regs = max(int(M/vlenmax), 1) + + accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > 32: + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & 
{N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + 
@classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + 
settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 9fea522f7..92880fbcf 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include +#include #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFABS_FLOAT __riscv_vfabs_v_f64m4 +#define VCOMPRESS __riscv_vcompress_vm_u64m4 +#define VMV_X __riscv_vmv_x_s_u64m4_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFABS_FLOAT __riscv_vfabs_v_f32m4 +#define VCOMPRESS __riscv_vcompress_vm_u32m4 +#define VMV_X __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, 
j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = 
VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 4e81e7848..0503f9948 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) 
v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index ca48a3c48..e24f9fd48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = 
VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index 2a677098d..a60bd3d07 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = 
VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +142,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, 
v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 66a101566..89cd510c1 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT __riscv_vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - 
"vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 818193a9e..74daf32b8 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 
+#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = 
VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); - if(cur_minf < minf){ + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 7f31e9a53..97f602e51 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 14b7e01ed..77bf19b9d 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 
__riscv_vfmv_v_f_f64m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; if (n < 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? 
n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 - +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_max = VFMAXVV_FLOAT(v_max, v0, gvl); @@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - 
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); if(j maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index d9eca7f10..095b1c3df 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, 
gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_min = VFMINVV_FLOAT(v_min, v0, gvl); @@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); if(j -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = 
K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k 0 ){ // scale change? + // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl 
<= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i Date: Wed, 1 Mar 2023 17:40:42 +0000 Subject: [PATCH 011/191] factoring riscv64/dot.c fix into separate PR as requested --- kernel/riscv64/dot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca..46a84ad18 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 1 ) return(dot); + if ( n < 0 ) return(dot); while(i < n) { From 1374a2d08b078451dcfaf723614ea13e441e1d06 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 19 Mar 2023 23:59:03 -0700 Subject: [PATCH 012/191] This PR adapts latest spec changes Add prefix (_riscv) for all riscv intrinsics Update some intrinsics' parameter, like vfredxxxx, vmerge --- kernel/riscv64/amax_rvv.c | 54 +++++------ kernel/riscv64/amin_rvv.c | 54 +++++------ kernel/riscv64/asum_rvv.c | 54 +++++------ kernel/riscv64/axpby_rvv.c | 36 +++---- kernel/riscv64/axpy_rvv.c | 28 +++--- kernel/riscv64/copy_rvv.c | 24 ++--- kernel/riscv64/dot_rvv.c | 66 ++++++------- kernel/riscv64/gemm_beta_rvv.c | 24 ++--- kernel/riscv64/gemm_ncopy_8_rvv.c | 28 +++--- kernel/riscv64/gemm_ncopy_rvv_v1.c | 20 ++-- kernel/riscv64/gemm_tcopy_8_rvv.c | 44 ++++----- kernel/riscv64/gemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/gemmkernel_rvv_v1x8.c | 24 ++--- kernel/riscv64/gemv_n_rvv.c | 28 +++--- kernel/riscv64/gemv_t_rvv.c | 53 +++++------ kernel/riscv64/iamax_rvv.c | 115 +++++++++++----------- kernel/riscv64/iamin_rvv.c | 117 +++++++++++------------ kernel/riscv64/imax_rvv.c | 113 
+++++++++++----------- kernel/riscv64/imin_rvv.c | 113 +++++++++++----------- kernel/riscv64/izamax_rvv.c | 127 ++++++++++++------------- kernel/riscv64/izamin_rvv.c | 121 ++++++++++++----------- kernel/riscv64/max_rvv.c | 50 +++++----- kernel/riscv64/min_rvv.c | 50 +++++----- kernel/riscv64/nrm2_rvv.c | 46 ++++----- kernel/riscv64/rot_rvv.c | 36 +++---- kernel/riscv64/scal_rvv.c | 32 +++---- kernel/riscv64/sum_rvv.c | 50 +++++----- kernel/riscv64/swap_rvv.c | 28 +++--- kernel/riscv64/symm_lcopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symm_ucopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symv_L_rvv.c | 81 ++++++++-------- kernel/riscv64/symv_U_rvv.c | 81 ++++++++-------- kernel/riscv64/trmm_lncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_ltcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmm_uncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_utcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmmkernel_rvv_v1x8.c | 28 +++--- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 42 ++++---- kernel/riscv64/trsm_lncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_ltcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_uncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_utcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/zamax_rvv.c | 58 +++++------ kernel/riscv64/zamin_rvv.c | 58 +++++------ kernel/riscv64/zasum_rvv.c | 51 +++++----- kernel/riscv64/zaxpby_rvv.c | 52 +++++----- kernel/riscv64/zaxpy_rvv.c | 32 +++---- kernel/riscv64/zcopy_rvv.c | 44 ++++----- kernel/riscv64/zdot_rvv.c | 65 +++++++------ kernel/riscv64/zgemm_beta_rvv.c | 32 +++---- kernel/riscv64/zgemm_ncopy_4_rvv.c | 24 ++--- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemm_tcopy_4_rvv.c | 40 ++++---- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 36 +++---- kernel/riscv64/zgemv_n_rvv.c | 48 +++++----- kernel/riscv64/zgemv_t_rvv.c | 61 ++++++------ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/znrm2_rvv.c | 66 ++++++------- kernel/riscv64/zrot_rvv.c | 52 +++++----- kernel/riscv64/zscal_rvv.c | 44 ++++----- kernel/riscv64/zsum_rvv.c | 47 +++++---- kernel/riscv64/zswap_rvv.c | 24 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 61 ++++++------ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 40 ++++---- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 36 +++---- 79 files changed, 2013 insertions(+), 2031 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index c9c6e7f73..be0bdbea0 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index 370b6c338..d4926084b 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 4f711c9be..691591e22 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -93,7 +93,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); asumf = VFMVFS_FLOAT_M1(v_res); return(asumf); } diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index 7c35c563d..a1dbdb0e4 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c index 3986f4e21..8bc2f30de 100644 --- a/kernel/riscv64/axpy_rvv.c +++ b/kernel/riscv64/axpy_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 5d5a8bd04..041fd2dae 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 60dcac2f5..3276695b6 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -37,24 +37,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( n <= 0 ) return(dot); - size_t vlmax = vsetvlmax_e64m8(); - vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + size_t vlmax = __riscv_vsetvlmax_e64m8(); + vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } @@ -63,18 +63,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -82,18 +82,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_x = inc_x * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); 
+ vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else { @@ -102,25 +102,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } - vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); - vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); - dot = vfmv_f_s_f64m1_f64(vec_sum); + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); return(dot); } diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c index 34d1ea078..f3cf6491d 100644 --- a/kernel/riscv64/gemm_beta_rvv.c +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 #endif // Optimizes the implementation in ../generic/gemm_beta.c diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 525b223c2..3030d67fb 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c index 2c5230752..2d6db15e5 100644 --- a/kernel/riscv64/gemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 81c1f962b..080a87312 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEV_FLOAT vlse32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEV_FLOAT vlse64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c index a291b70b8..c5fb6479f 100644 --- a/kernel/riscv64/gemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c index 5cd509f93..471b3158f 100644 --- a/kernel/riscv64/gemmkernel_rvv_v1x8.c +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #endif int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c index 9d2dee615..1366eb5ad 100644 --- a/kernel/riscv64/gemv_n_rvv.c +++ b/kernel/riscv64/gemv_n_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index a80af81b6..f0c834866 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -63,7 +63,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va, vx, vr; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -83,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; @@ -107,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index 8b33b3bcb..ef7850a55 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -28,57 +28,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 
__riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -134,16 +134,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 585b37186..56a086fed 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -29,57 +29,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <float.h>

#if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
+#define VFIRSTM __riscv_vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64
#else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define 
VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -135,16 +135,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index d84ad968e..5b60a56f7 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <float.h>

#if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
+#define VFIRSTM __riscv_vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64
#else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
+#define VFIRSTM __riscv_vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define 
VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_min; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index fb734f6f8..b49544a1b 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <float.h>

#if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
+#define VFIRSTM __riscv_vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64
#else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
+#define VFIRSTM __riscv_vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define 
VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index 9cb332cbb..e61d0cbec 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -28,63 +28,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define MASK_T vbool16_t -#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 -#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 -#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFIRSTM vfirst_m_b16 -#define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT vid_v_u64m4_m -#define VIDV_UINT vid_v_u64m4 -#define VADDVX_MASK_UINT vadd_vx_u64m4_m -#define VADDVX_UINT vadd_vx_u64m4 -#define VMVVX_UINT vmv_v_x_u64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 -#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 #else -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 -#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 -#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT vid_v_u32m4_m -#define VIDV_UINT vid_v_u32m4 -#define VADDVX_MASK_UINT vadd_vx_u32m4_m -#define VADDVX_UINT vadd_vx_u32m4 -#define VMVVX_UINT vmv_v_x_u32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 -#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define 
VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,24 +138,23 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 69771e5aa..297b3c99a 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -29,59 +29,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <float.h>

#if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
-#define MASK_T vbool16_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
-#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m4
-#define VFMINVV_FLOAT vfmin_vv_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFIRSTM vfirst_m_b16
-#define UINT_V_T vuint64m4_t
-#define VIDV_MASK_UINT vid_v_u64m4_m
-#define VIDV_UINT vid_v_u64m4
-#define VADDVX_MASK_UINT vadd_vx_u64m4_m
-#define VADDVX_UINT vadd_vx_u64m4
-#define VMVVX_UINT vmv_v_x_u64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
-#define VMVVXS_UINT vmv_x_s_u64m4_u64
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1
+#define MASK_T vbool16_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4
+#define VFIRSTM __riscv_vfirst_m_b16
+#define UINT_V_T vuint64m4_t
+#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m
+#define VIDV_UINT __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m
+#define VADDVX_UINT __riscv_vadd_vx_u64m4
+#define VMVVX_UINT __riscv_vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4
+#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64
#else
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
-#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m4
-#define VFMINVV_FLOAT vfmin_vv_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint32m4_t
-#define VIDV_MASK_UINT vid_v_u32m4_m
-#define VIDV_UINT vid_v_u32m4
-#define VADDVX_MASK_UINT vadd_vx_u32m4_m
-#define VADDVX_UINT vadd_vx_u32m4
-#define VMVVX_UINT vmv_v_x_u32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
-#define VMVVXS_UINT vmv_x_s_u32m4_u32
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT 
__riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -145,16 +145,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 5b1380d2b..9315321f4 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index bddcc0ba7..158b682fd 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 979c31648..42abfa119 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -29,30 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define ABS fabsf #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define ABS fabs #endif @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c index 7bf5e4270..90f81d5e2 100644 --- a/kernel/riscv64/rot_rvv.c +++ b/kernel/riscv64/rot_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index d2c0378bf..2e2cfd31e 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 1db0d09dd..9715faf22 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -89,7 +89,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); sumf = VFMVFS_FLOAT_M1(v_res); return(sumf); } diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c index 2cf92f6ad..893d70554 100644 --- a/kernel/riscv64/swap_rvv.c +++ b/kernel/riscv64/swap_rvv.c @@ -28,23 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c index f0def9617..a615db44d 100644 --- a/kernel/riscv64/symm_lcopy_rvv_v1.c +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_lcopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, 
BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c index 958506df3..464f97b3a 100644 --- a/kernel/riscv64/symm_ucopy_rvv_v1.c +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_ucopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index 737abaae3..e87ab22ae 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -28,43 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, 
BLASLONG lda, FLOA vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jy += inc_y; @@ -172,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; @@ -211,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index cb923be5d..3fbc33c89 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -29,43 +29,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 
__riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT *a_ptr = a; FLOAT_V_T_M1 v_res, v_z0; size_t vl_max = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vl_max); v_z0 = VFMVVF_FLOAT_M1(0, vl_max); vl_max = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], vl); vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -171,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -209,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 3457ca3e1..4135a9b62 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c @@ -116,10 +116,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c index 2fe8cf79e..580714fde 100644 --- a/kernel/riscv64/trmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c @@ -111,10 +111,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c index b64cd840d..852ab7f11 100644 --- a/kernel/riscv64/trmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c @@ -114,10 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c index b96daae5b..e0b6d362d 100644 --- a/kernel/riscv64/trmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -32,27 +32,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c @@ -113,10 +113,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c index 97b14650c..393b24bce 100644 --- a/kernel/riscv64/trmmkernel_rvv_v1x8.c +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 2cba06b38..886af0c3b 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT 
__riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 492a5631f..ddeef966c 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4751ae012..4c83bbaa3 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif static FLOAT dm1 = -1.; diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 93a9e6916..b368eefb9 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -28,28 +28,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c index bacfb2b08..41c84be25 100644 --- a/kernel/riscv64/trsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c index 0fc7c9f24..003bd3465 100644 --- a/kernel/riscv64/trsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif #ifndef UNIT diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c index ee869a795..6cca5d49c 100644 --- a/kernel/riscv64/trsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c index a324b0fa6..bc058525f 100644 --- a/kernel/riscv64/trsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 1917042be..615b7519c 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,7 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index 3f027383a..a0d36d46f 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -105,7 +105,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 7876646b3..1d2f0e1fe 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -99,9 +99,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); asumf += VFMVFS_FLOAT_M1(v_res); return(asumf); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index 66f52d9d0..e0da55311 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -33,33 +33,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 777bcb728..3f75898e0 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index 5d8322bbb..bd94810ce 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL_M8(n) vsetvl_e32m8(n) -#define FLOAT_V_T_M8 vfloat32m8_t -#define VLEV_FLOAT_M8 vle32_v_f32m8 -#define VSEV_FLOAT_M8 vse32_v_f32m8 - -#define VSETVL_M4(n) vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL_M8(n) vsetvl_e64m8(n) -#define FLOAT_V_T_M8 vfloat64m8_t -#define VLEV_FLOAT_M8 vle64_v_f64m8 -#define VSEV_FLOAT_M8 vse64_v_f64m8 - -#define VSETVL_M4(n) vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 7eae6f608..1543c513d 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -72,7 +72,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax_m1 = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); size_t vlmax = VSETVL_MAX; @@ -161,9 +160,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); CREAL(result) = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); CIMAG(result) = VFMVFS_FLOAT_M1(v_res); return(result); diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index a89752d18..b94b5f4bf 100644 --- 
a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -39,23 +39,23 @@ #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index 389ee5d57..d50a4b8d5 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index df039bab6..1d3b8d3b7 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 1b34039c8..8c35b5616 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7622fb810..7a085269c 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 50e29222f..41399cf79 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 2eeb61b45..4a40c30a7 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index b682d5cd8..15795cc3a 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -73,7 +73,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -105,9 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -149,9 +148,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c 
b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index cf466d3fa..79b20a646 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -104,13 +104,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = 
VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index 6209f5417..a86815275 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT 
__riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -101,13 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 921ddb8cb..5f7873b5a 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,35 +28,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT 
__riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif // TODO: Should single precision use the widening MAC, or perhaps all should be double? @@ -85,10 +85,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } @@ -103,16 +103,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); scale = VFMVFS_FLOAT_M1(v_max); diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index 68066a00b..ee81bfe91 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 079c36a2d..779fab68c 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 3928fbe27..44df112c6 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -88,9 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); sumf += VFMVFS_FLOAT_M1(v_res); return(sumf); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 86f9103d3..17b7b9f43 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index df5c916a5..0f9e04869 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 
+#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) @@ -91,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index dcf2b081a..fdc693700 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif @@ 
-92,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index afd694408..7276618c5 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -121,12 +121,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, 
vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index c7d593949..72e8f2ce2 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -30,33 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,14 +117,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON //va1 = VLEV_FLOAT(ao, vl); VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - //VSEV_FLOAT(b, vb, vl); VSSEG2_FLOAT(b, va0, va1, vl); ao += lda * 2; b += vl * 2; 
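In the zsymm/ztrmm copy routines the other systematic change is the operand order of the merge intrinsics: the mask moves from the first argument to the position just before vl, so `vmerge_vvm(mask, op1, op2, vl)` becomes `__riscv_vmerge_vvm(op1, op2, mask, vl)`, and likewise for `vfmerge_vfm`; the selected value is still taken from op2 where the mask bit is set. A sketch of the triangle masking these routines perform, under the new operand order (hypothetical helper name, unit-stride real data for brevity):

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical illustration (not the packing routine itself): for column j of
 * a row chunk starting at global row `row0`, keep a[i] for i >= j and force
 * the lanes in the strict upper triangle (row index < j) to zero. */
static void keep_lower_triangle(float *col, unsigned int row0, unsigned int j, size_t vl)
{
    vuint32m2_t vrow = __riscv_vadd_vx_u32m2(__riscv_vid_v_u32m2(vl), row0, vl);
    vbool16_t above  = __riscv_vmsltu_vx_u32m2_b16(vrow, j, vl); /* rows above the diagonal */
    vfloat32m2_t va  = __riscv_vle32_v_f32m2(col, vl);
    va = __riscv_vfmerge_vfm_f32m2(va, 0.0f, above, vl);         /* masked lanes become 0 */
    __riscv_vse32_v_f32m2(col, va, vl);
}
```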
diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index 3c70b6385..e6d36c86d 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -120,12 +120,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 706782cf0..7085cfc37 100644 --- 
a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -32,33 +32,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,12 +117,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += lda * 2; diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c index 3486a4648..399124d2e 100644 --- a/kernel/riscv64/ztrmmkernel_2x2_rvv.c +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
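The UNIT branches in the ztrmm copy hunks above use the same merge pattern to patch the diagonal of a unit-triangular panel: a compare against the current column index produces the mask, then the real part is merged with ONE and the imaginary part with ZERO. A minimal sketch under the new operand order (hypothetical helper, m2/SEW=32 types as in the kernels):

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical sketch of the UNIT-diagonal handling: lanes whose row index
 * equals the current column j get the value 1 + 0i. */
static void force_unit_diagonal(vfloat32m2_t *re, vfloat32m2_t *im,
                                vuint32m2_t vrow, unsigned int j, size_t vl)
{
    vbool16_t on_diag = __riscv_vmseq_vx_u32m2_b16(vrow, j, vl);
    *re = __riscv_vfmerge_vfm_f32m2(*re, 1.0f, on_diag, vl);  /* real part -> ONE  */
    *im = __riscv_vfmerge_vfm_f32m2(*im, 0.0f, on_diag, vl);  /* imag part -> ZERO */
}
```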
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif // Optimizes the implementation in ../generic/ztrmmkernel_2x2.c diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 27409ec25..92b4b855b 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
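The ztrmmkernel_2x2 macro table above builds complex multiply-accumulate out of the vector-vector FMA forms. A sketch of one such step, acc += a*b, with real and imaginary parts kept in separate registers (hypothetical helper; the conjugated kernel variants typically just swap vfmacc and vfnmsac on the affected terms):

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical sketch of one complex multiply-accumulate:
 *   acc_r += a_r*b_r - a_i*b_i
 *   acc_i += a_r*b_i + a_i*b_r  */
static inline void cmac_f32m2(vfloat32m2_t *acc_r, vfloat32m2_t *acc_i,
                              vfloat32m2_t a_r, vfloat32m2_t a_i,
                              vfloat32m2_t b_r, vfloat32m2_t b_i, size_t vl)
{
    *acc_r = __riscv_vfmacc_vv_f32m2(*acc_r, a_r, b_r, vl);
    *acc_r = __riscv_vfnmsac_vv_f32m2(*acc_r, a_i, b_i, vl);
    *acc_i = __riscv_vfmacc_vv_f32m2(*acc_i, a_r, b_i, vl);
    *acc_i = __riscv_vfmacc_vv_f32m2(*acc_i, a_i, b_r, vl);
}
```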
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index b7ccb1eb3..383cb883f 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
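Several of the copy and trsm-copy routines above read a column-major panel across rows, which is what the strided-load macros (VLSEV_FLOAT, VLSSEG2_FLOAT) are for. A small sketch of the addressing they imply (hypothetical helper; the byte stride is the leading dimension times the element size, doubled for interleaved complex data):

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical sketch: load up to vl elements of row i from a column-major
 * matrix with leading dimension lda, i.e. a[i + 0*lda], a[i + 1*lda], ... */
static vfloat32m2_t load_row_f32(const float *a, size_t i, size_t lda, size_t vl)
{
    ptrdiff_t bstride = (ptrdiff_t)(lda * sizeof(float));
    return __riscv_vlse32_v_f32m2(a + i, bstride, vl);
}
```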
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index 911b81de5..f57e9f1de 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index db075c29b..be3613429 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -30,25 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED 
OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index e121c6273..b1f5ef8f0 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif From 6b74bee2f9d7272f1932a9ba9bbd1bda6c122fbf Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 27 Mar 2023 18:59:24 -0700 Subject: [PATCH 013/191] Update TARGET=x280 description. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ecb46178..1f1c0f3ed 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) -- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. +- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. ```sh make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran ``` From 18d7afe69daa196902cd68b63cc381aaafc9d26e Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Sat, 20 May 2023 01:19:44 +0800 Subject: [PATCH 014/191] Add rvv support for zsymv and active rvv support for zhemv --- kernel/riscv64/KERNEL.x280 | 17 ++- kernel/riscv64/zhemv_LM_rvv.c | 198 +++++++++++++++++++++++++++++++++ kernel/riscv64/zhemv_UV_rvv.c | 199 ++++++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_L_rvv.c | 179 ++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_U_rvv.c | 177 ++++++++++++++++++++++++++++++ 5 files changed, 766 insertions(+), 4 deletions(-) create mode 100644 kernel/riscv64/zhemv_LM_rvv.c create mode 100644 kernel/riscv64/zhemv_UV_rvv.c create mode 100644 kernel/riscv64/zsymv_L_rvv.c create mode 100644 kernel/riscv64/zsymv_U_rvv.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 217d8534e..86708fe01 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -225,10 +225,19 @@ SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c DSYMV_U_KERNEL = symv_U_rvv.c DSYMV_L_KERNEL = symv_L_rvv.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 000000000..e025120e5 --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + jx = 0; + jy = 0; + ja = 0; + for(j = 0; j < offset; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + ix = jx + inc_x2; + iy = jy + inc_y2; + ia = ja + 2; + i = j + 1; + len = m - i; + if(len > 0){ + gvl = VSETVL(len); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 
0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ + gvl = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 000000000..0e1ea5436 --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j 
< m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = VSETVL(j); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ + gvl = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 000000000..3bf621094 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ 
+/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + 
BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < m){ + gvl = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 000000000..de1564f75 --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for (j=m1; j 0){ 
+ ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < j){ + gvl = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + From d3bf5a5401e623e107a23fb70151c7102cbd14c7 Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Mon, 22 May 2023 22:39:45 +0800 Subject: [PATCH 015/191] Combine two reduction operations of zhe/symv into one, with tail undisturbed setted. --- kernel/riscv64/zhemv_LM_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zhemv_UV_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zsymv_L_rvv.c | 50 +++++++++++++------------- kernel/riscv64/zsymv_U_rvv.c | 52 +++++++++++++-------------- 4 files changed, 119 insertions(+), 119 deletions(-) diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c index e025120e5..95c6a377c 100644 --- a/kernel/riscv64/zhemv_LM_rvv.c +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
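/*
 * A minimal sketch of the reduction pattern these hunks adopt, not the kernel
 * code itself: the remainder iteration accumulates into the same vector
 * register with a tail-undisturbed (_tu) multiply-add, so lanes at and beyond
 * the shorter vl keep the partial sums from the full-width loop, and a single
 * vfredusum at the end covers both.  The f64/m4 types and the function name
 * are illustrative only.
 */
#include <riscv_vector.h>
#include <stddef.h>

static double dot_tail_undisturbed(const double *x, const double *y, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e64m4();
    vfloat64m4_t vacc = __riscv_vfmv_v_f_f64m4(0.0, vlmax);    /* running partial sums */
    size_t i = 0;
    for (; i + vlmax <= n; i += vlmax) {                       /* full-width body */
        vfloat64m4_t vx = __riscv_vle64_v_f64m4(&x[i], vlmax);
        vfloat64m4_t vy = __riscv_vle64_v_f64m4(&y[i], vlmax);
        vacc = __riscv_vfmacc_vv_f64m4(vacc, vx, vy, vlmax);
    }
    if (i < n) {                                               /* remainder, vl < vlmax */
        size_t vl = __riscv_vsetvl_e64m4(n - i);
        vfloat64m4_t vx = __riscv_vle64_v_f64m4(&x[i], vl);
        vfloat64m4_t vy = __riscv_vle64_v_f64m4(&y[i], vl);
        /* _tu keeps lanes [vl, vlmax) of vacc untouched */
        vacc = __riscv_vfmacc_vv_f64m4_tu(vacc, vx, vy, vl);
    }
    vfloat64m1_t vzero = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());
    vfloat64m1_t vsum  = __riscv_vfredusum_vs_f64m4_f64m1(vacc, vzero, vlmax);
    return __riscv_vfmv_f_s_f64m1_f64(vsum);
}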
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ - gvl = VSETVL(m-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, 
gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 += VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c index 0e1ea5436..ec06622fc 100644 --- a/kernel/riscv64/zhemv_UV_rvv.c +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,49 +146,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ - gvl = VSETVL(j-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 
+= VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += temp_r1 * a_ptr[ja]; y[jy+1] += temp_i1 * a_ptr[ja]; diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c index 3bf621094..cefdea7f6 100644 --- a/kernel/riscv64/zsymv_L_rvv.c +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -57,6 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -133,38 +137,34 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < m){ - gvl = VSETVL(m-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += 
VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c index de1564f75..67b5a649c 100644 --- a/kernel/riscv64/zsymv_U_rvv.c +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -56,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -129,39 +133,35 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < j){ - gvl = VSETVL(j-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, 
va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); - } + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; From 0954746380a54c4f5c45f892dabef6b7c9aa93c3 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 4 Jun 2023 20:06:58 -0700 Subject: [PATCH 016/191] remove argument unused during compilation. fix wrong vr = VFMVVF_FLOAT(0, vl); --- Makefile.riscv64 | 2 +- kernel/riscv64/symv_L_rvv.c | 4 ---- kernel/riscv64/symv_U_rvv.c | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d091984a6..ce7a27141 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index e87ab22ae..b27db2e37 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -94,7 +94,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLEV_FLOAT(&y[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -157,7 +155,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); inc_xv = inc_x * vl; va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3fbc33c89..7e45b1a01 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -95,7 +95,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = j; k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -158,7 +156,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT 
alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_xv = inc_x * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); From e1958eb70529c36d7dc4f3baf9e7bf37524053ab Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 5 Jul 2023 11:34:00 +0100 Subject: [PATCH 017/191] Fixes RVV masked intrinsics for iamax/iamin/imax/imin kernels Changes masked intrinsics from _m to _mu and reintroduces maskedoff argument. --- kernel/riscv64/iamax_rvv.c | 16 ++++++++-------- kernel/riscv64/iamin_rvv.c | 16 ++++++++-------- kernel/riscv64/imax_rvv.c | 16 ++++++++-------- kernel/riscv64/imin_rvv.c | 16 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index ef7850a55..d3508a91d 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 56a086fed..ae1d4f726 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -46,9 +46,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
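/*
 * A small sketch, not the kernel itself, of the masked index update these
 * hunks switch to.  The plain _m forms treat inactive lanes as agnostic, so
 * the running index vector could be clobbered; the _mu (mask-undisturbed)
 * forms take the old destination back as the maskedoff operand and leave
 * inactive lanes unchanged.  The e32/m8 types mirror the single-precision
 * macros; the function name and arguments are illustrative only.
 */
#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

static void update_argmax(vfloat32m8_t vx, vfloat32m8_t *v_max,
                          vuint32m8_t *v_max_index, uint32_t j, size_t vl)
{
    /* lanes where the newly loaded data beats the current maximum */
    vbool4_t mask = __riscv_vmflt_vv_f32m8_b4(*v_max, vx, vl);
    /* rewrite only those lanes with their global index j + lane id */
    *v_max_index = __riscv_vid_v_u32m8_mu(mask, *v_max_index, vl);
    *v_max_index = __riscv_vadd_vx_u32m8_mu(mask, *v_max_index, *v_max_index, j, vl);
    /* fold the new data into the running maximum */
    *v_max = __riscv_vfmax_vv_f32m8(*v_max, vx, vl);
}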
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -72,9 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 5b60a56f7..33250568d 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index b49544a1b..4ce49c3af 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); From 1e4a3a2b5e111a6a94eb53946fa92c1715c5dd5e Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 12 Jul 2023 12:55:50 +0100 Subject: [PATCH 018/191] Fixes RVV masked intrinsics for izamax/izamin kernels --- kernel/riscv64/izamax_rvv.c | 18 +++++++++--------- kernel/riscv64/izamin_rvv.c | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e61d0cbec..e43ded820 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -77,9 +77,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,9 +138,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); - + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 297b3c99a..cc3c37c8e 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -47,9 +47,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -74,9 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); From 8df0289db61ea5a3e461c94c51a5798e2dd18b86 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Thu, 20 Jul 2023 15:28:35 +0100 Subject: [PATCH 019/191] Adds tail undisturbed for RVV Level 1 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed. --- kernel/riscv64/amax_rvv.c | 8 +++--- kernel/riscv64/amin_rvv.c | 8 +++--- kernel/riscv64/asum_rvv.c | 8 +++--- kernel/riscv64/dot_rvv.c | 16 +++++------ kernel/riscv64/iamax_rvv.c | 24 ++++++++-------- kernel/riscv64/iamin_rvv.c | 24 ++++++++-------- kernel/riscv64/imax_rvv.c | 24 ++++++++-------- kernel/riscv64/imin_rvv.c | 24 ++++++++-------- kernel/riscv64/izamax_rvv.c | 24 ++++++++-------- kernel/riscv64/izamin_rvv.c | 24 ++++++++-------- kernel/riscv64/max_rvv.c | 8 +++--- kernel/riscv64/min_rvv.c | 8 +++--- kernel/riscv64/nrm2_rvv.c | 8 +++--- kernel/riscv64/sum_rvv.c | 8 +++--- kernel/riscv64/zamax_rvv.c | 8 +++--- kernel/riscv64/zamin_rvv.c | 8 +++--- kernel/riscv64/zasum_rvv.c | 12 ++++---- kernel/riscv64/zdot_rvv.c | 56 ++++++++++++++++++------------------- kernel/riscv64/znrm2_rvv.c | 24 ++++++++-------- kernel/riscv64/zsum_rvv.c | 12 ++++---- 20 files changed, 168 insertions(+), 168 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index be0bdbea0..451fbc834 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
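/*
 * An illustrative sketch of the hazard described in the commit message above,
 * not the kernel code: the accumulator lives across iterations, so on the
 * last, shorter iteration the lanes at and beyond vl must be left
 * undisturbed, otherwise the final reduction over vlmax reads overwritten
 * values.  The f64/m8 types follow the macros below; the function name is
 * illustrative only.
 */
#include <riscv_vector.h>
#include <stddef.h>

static double amax_sketch(const double *x, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e64m8();
    vfloat64m8_t vmax = __riscv_vfmv_v_f_f64m8(0.0, vlmax);    /* running |x| maxima */
    for (size_t i = 0; i < n; ) {
        size_t vl = __riscv_vsetvl_e64m8(n - i);               /* vl < vlmax on the tail */
        vfloat64m8_t vx = __riscv_vle64_v_f64m8(&x[i], vl);
        vx = __riscv_vfabs_v_f64m8(vx, vl);
        /* tail-undisturbed: lanes [vl, vlmax) keep their earlier maxima */
        vmax = __riscv_vfmax_vv_f64m8_tu(vmax, vmax, vx, vl);
        i += vl;
    }
    vfloat64m1_t v0 = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());
    vfloat64m1_t vr = __riscv_vfredmax_vs_f64m8_f64m1(vmax, v0, vlmax);
    return __riscv_vfmv_f_s_f64m1_f64(vr);
}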
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index d4926084b..5186d7b12 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 691591e22..0ea610cbb 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -76,7 +76,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -88,7 +88,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 3276695b6..837badf41 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -49,12 +49,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } @@ -69,12 +69,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -88,12 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else { @@ -108,12 +108,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index d3508a91d..8362d7cef 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
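/*
 * Side sketch, not part of the patch: in the single-precision branch of
 * dot_rvv.c above, the products are accumulated in double precision with the
 * widening multiply-add, pairing f32/m4 sources with an f64/m8 accumulator
 * (same lane count), and the _tu form again protects the tail lanes.  The
 * function name is illustrative only.
 */
#include <riscv_vector.h>
#include <stddef.h>

static double sdot_widen(const float *x, const float *y, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();                  /* equals the e64m8 lane count */
    vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0.0, vlmax);
    for (size_t i = 0; i < n; ) {
        size_t vl = __riscv_vsetvl_e32m4(n - i);
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vl);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(&y[i], vl);
        /* f64m8 accumulator += widened f32m4 * f32m4 products */
        vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
        i += vl;
    }
    vfloat64m1_t vz = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());
    vfloat64m1_t vs = __riscv_vfredusum_vs_f64m8_f64m1(vr, vz, vlmax);
    return __riscv_vfmv_f_s_f64m1_f64(vs);
}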
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -68,12 +68,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,11 +106,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -125,11 +125,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index ae1d4f726..f90dbb545 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -43,12 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -69,12 +69,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,11 +107,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -126,11 +126,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 33250568d..b1a77b178 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index 4ce49c3af..1de7f3233 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e43ded820..e93f0056c 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -44,13 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,13 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,11 +116,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } else { @@ -138,11 +138,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index cc3c37c8e..b5bc27404 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -43,13 +43,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,13 +70,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,11 +113,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } else { @@ -136,11 +136,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 9315321f4..745c27bf4 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index 158b682fd..78528fef9 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 42abfa119..994fadb70 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLEV_FLOAT(x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } else { @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(x, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 9715faf22..c5629197f 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 615b7519c..bbb1e876b 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } @@ -101,7 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } } diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index a0d36d46f..c5453121b 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } else { @@ -100,7 +100,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 1d2f0e1fe..ebec1b19c 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -51,7 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif @@ -75,8 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } @@ -93,8 +93,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 1543c513d..fa0e89353 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -36,12 +36,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) @@ -52,12 +52,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -86,14 +86,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } @@ -107,14 +107,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } else if (inc_y == 1){ @@ -127,14 +127,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } }else { @@ -148,14 +148,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = 
VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 5f7873b5a..d2b27aa8d 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else @@ -51,10 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif @@ -85,11 +85,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } else { @@ -103,11 +103,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 44df112c6..b41f70eb5 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -69,8 +69,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSEG_FLOAT(&v0, &v1, x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } else { @@ -82,8 +82,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } From 826a9d5fa47f20f23f42c97385e72e121a2efb4f Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Tue, 25 Jul 2023 11:36:23 +0100 Subject: [PATCH 020/191] Adds tail undisturbed for RVV Level 2 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed. --- kernel/riscv64/gemv_t_rvv.c | 8 ++++---- kernel/riscv64/symv_L_rvv.c | 12 +++++------ kernel/riscv64/symv_U_rvv.c | 12 +++++------ kernel/riscv64/zgemv_t_rvv.c | 40 ++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index f0c834866..9c859aa50 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLEV_FLOAT(x_ptr, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLSEV_FLOAT(x_ptr, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index b27db2e37..888d628a5 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
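For readers unfamiliar with RVV tail policies, the following minimal sketch (not taken from the OpenBLAS sources; the function name and loop shape are illustrative) shows the failure mode the patch 020 message above describes. A reduction accumulates into a register group that is vlmax elements wide, but the last loop iteration runs with vl < vlmax; with the default tail-agnostic intrinsics the lanes past vl may be clobbered, and the final vfredusum over vlmax then folds garbage into the result. The _tu variants take the accumulator as the destination operand and leave those tail lanes undisturbed, which is exactly the substitution these hunks make.

#include <riscv_vector.h>

/* Illustrative only: a float dot product in the style of the *_rvv.c kernels. */
static float dot_tail_undisturbed(const float *x, const float *y, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vr = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);   /* accumulator spans vlmax lanes */

    for (size_t i = 0; i < n; ) {
        size_t vl = __riscv_vsetvl_e32m8(n - i);              /* vl < vlmax on the final pass */
        vfloat32m8_t vx = __riscv_vle32_v_f32m8(&x[i], vl);
        vfloat32m8_t vy = __riscv_vle32_v_f32m8(&y[i], vl);
        /* _tu keeps vr[vl..vlmax-1] intact; the tail-agnostic form may overwrite them. */
        vr = __riscv_vfmacc_vv_f32m8_tu(vr, vx, vy, vl);
        i += vl;
    }

    /* The reduction runs over vlmax lanes, so stale tail lanes would corrupt
       the sum if the loop above were tail-agnostic. */
    vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    vfloat32m1_t v_sum  = __riscv_vfredusum_vs_f32m8_f32m1(vr, v_zero, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(v_sum);
}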
#define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 7e45b1a01..3cfd3ee4c 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; } diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 15795cc3a..2f0380530 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 @@ -90,15 +90,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; @@ -134,15 +134,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; From 4d0f000db63fedbe62ca318bb7bcd5d3152fc4a2 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 7 Aug 2023 16:55:59 +0800 Subject: [PATCH 021/191] MIPS: Enable MSA --- kernel/mips/KERNEL.P5600 | 34 ++++++++++----------- kernel/mips64/KERNEL.LOONGSON3R4 | 32 ++++++++++---------- param.h | 52 ++++++++++++++++---------------- 3 files changed, 59 insertions(+), 59 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..2e1e3390d 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c CSUMKERNEL = ../mips/zsum.c ZSUMKERNEL = ../mips/zsum.c -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c ZASUMKERNEL = ../mips/zasum.c endif -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL 
= ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c @@ -100,7 +100,7 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c @@ -112,7 +112,7 @@ CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c @@ -124,7 +124,7 @@ CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -136,7 +136,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -148,7 +148,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -162,7 +162,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -180,7 +180,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -198,7 +198,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -212,7 +212,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -224,7 +224,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -236,7 +236,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -248,7 +248,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git 
a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 index b81e5441d..1149d97f1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3R4 +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -1,4 +1,4 @@ -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S DAXPYKERNEL = daxpy_loongson3a_simd.S endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c endif DSDOTKERNEL = ../mips/dot.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c SGEMVTKERNEL = ../mips/sgemv_t_msa.c @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c endif -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/param.h b/param.h index 547463b2f..a4672ce07 100644 --- a/param.h +++ b/param.h @@ -2748,19 +2748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#ifdef HAVE_MSA -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2772,6 +2760,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 64 @@ -2961,19 +2961,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2985,6 +2973,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 128 From 01c7010543d1abb5e99da0b97b731aff1acd5aa4 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Korinsky" Date: Mon, 27 Nov 2023 14:28:08 +0000 Subject: [PATCH 022/191] cmake/openblas.pc.in: fixed version and URL --- cmake/openblas.pc.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 7e120af86..11e5606e5 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version -Version: @OPENBLAS_VERSION@ -URL: https://github.com/xianyi/OpenBLAS +Version: @OpenBLAS_VERSION@ +URL: https://github.com/OpenMathLib/OpenBLAS Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From 89fa51d495560b88bd2057aae4782e2ee58ce706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= Date: Sun, 3 Dec 2023 19:06:49 +0100 Subject: [PATCH 023/191] Revert 42b5e08 ("Allow weak linking on old macOS") --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7203088a0..6b02864ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,13 +258,13 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 2381132ada5e97a576f47abff7f97f6d2a26266a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= Date: Sun, 3 Dec 2023 19:13:53 +0100 Subject: [PATCH 024/191] Darwin < 20: always write xerbla.c.o into archive Write xerbla.c.o into archive regardless of timestamp by using ar -rs instead of ar -ru. 
--- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b02864ca..f3eac2edf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,14 +256,14 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From e4586e81b896b85b600c50f9670e59989cbdabf7 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:02:18 +0000 Subject: [PATCH 025/191] [RISC-V] Add RISC-V Vector 128-bit target Current RVV x280 target depends on vlen=512-bits for Level 3 operations. Commit adds generic target that supports vlen=128-bits. New target uses the same scalable kernels as x280 for Level 1&2 operations, and autogenerated kernels for Level 3 operations. Functional correctness of Level 3 operations tested on vlen=128-bits using QEMU v8.1.1 for ctests and BLAS-Tester. 
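To make the vector-length point concrete, here is a standalone snippet (not part of the patch): the element count of one LMUL=2 register group is fixed by the hardware VLEN, so Level 3 tile sizes tuned for the 512-bit x280 do not carry over to a 128-bit implementation. That is why this patch ships fixed 8x4 / 8x8 / 4x4 zvl128b kernels while reusing the scalable Level 1 and 2 code.

#include <riscv_vector.h>
#include <stdio.h>

int main(void)
{
    size_t lanes_e32m2 = __riscv_vsetvlmax_e32m2();  /* 8 lanes at VLEN=128, 32 at VLEN=512 */
    size_t lanes_e64m2 = __riscv_vsetvlmax_e64m2();  /* 4 lanes at VLEN=128, 16 at VLEN=512 */
    printf("e32m2 lanes: %zu, e64m2 lanes: %zu\n", lanes_e32m2, lanes_e64m2);
    return 0;
}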
--- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + cpuid_riscv64.c | 4 +- getarch.c | 13 +- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 243 +++++ kernel/riscv64/cgemm_kernel_8x4_zvl128b.c | 996 +++++++++++++++++++ kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c | 1102 +++++++++++++++++++++ kernel/riscv64/dgemm_kernel_8x4_zvl128b.c | 492 +++++++++ kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c | 660 ++++++++++++ kernel/riscv64/sgemm_kernel_8x8_zvl128b.c | 791 +++++++++++++++ kernel/riscv64/strmm_kernel_8x8_zvl128b.c | 991 ++++++++++++++++++ kernel/riscv64/zgemm_kernel_4x4_zvl128b.c | 720 ++++++++++++++ kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c | 805 +++++++++++++++ param.h | 39 + 15 files changed, 6863 insertions(+), 2 deletions(-) create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL128B create mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/strmm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c create mode 100644 kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a2602..b56169da0 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL128B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a27141..93e270bde 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL128B) +CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc..5b7a63831 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -119,6 +119,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V x280 diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f21..15a539c20 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 #define CPU_RISCV64_ZVL256B 2 +#define CPU_RISCV64_ZVL128B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", - "CPU_RISCV64_ZVL256B" + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" }; int detect(void){ diff --git a/getarch.c b/getarch.c index 772836347..b8b7ef7e0 100644 --- a/getarch.c +++ b/getarch.c @@ -1691,7 +1691,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "x280" #else #endif - +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 000000000..fec69ee09 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c 
+DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..bd615389c --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; 
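+  // Conjugation handling: the S0..S3 signs and the VFMACC_RR/VFMACC_RI pair
+  // selected above implement, for every A*B product accumulated over k,
+  //   acc_r += S0*Ar*Br + S1*Ai*Bi
+  //   acc_i += S2*Ai*Br + S3*Ar*Bi
+  // The vector path seeds each accumulator with the B-imaginary term (VFMUL)
+  // and folds in the B-real term (VFMACC_RR / VFMACC_RI); the scalar M-tail
+  // loops further down use the S0..S3 signs directly.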
+ BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t 
C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + 
vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = 
__riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; 
+ Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; 
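+      // N&2 tail: same accumulation scheme as the four-column main pass, but
+      // with only two columns of B (B0, B1) and two accumulator pairs per tile.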
+ float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector 
regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 
+ 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = 
__riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + 
m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..3268cb810 --- /dev/null +++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,1102 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='ctrmm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + 
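+    // TRMM variant: C is overwritten with alpha*A*B (C is not read back), and
+    // each tile's k loop is clipped to the triangular region via off/pass_K
+    // below: when BACKWARDS is set (LEFT differs from TRANSA) the first 'off'
+    // iterations are skipped; otherwise the loop runs for only off + (tile
+    // rows) iterations on the LEFT side, or off + (tile columns) otherwise.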
for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = 
__riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = 
VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; 
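+      // 16 scalar accumulators for this tail: 2 rows x 4 columns, real and
+      // imaginary parts, updated with the same S0..S3 sign convention.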
+ float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float 
result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = 
B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = 
VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + 
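The scalar complex tails above build the real and imaginary accumulators separately, using the sign constants S0..S3 that this kernel family defines earlier in the file according to the conjugation variant, while the vector paths do the same split with strided (vlse32) loads and the VFMACC_RR / VFMACC_RI macros. The closing Cr/Ci lines then apply the complex alpha. A minimal scalar sketch of that arithmetic, assuming the non-conjugated sign choice (S0 = 1, S1 = -1, S2 = 1, S3 = 1); the helper names are illustrative, not part of the kernel:

    /* Sketch only: one complex multiply-accumulate plus complex alpha scaling.
       Assumes the non-conjugated signs S0=1, S1=-1, S2=1, S3=1; the names
       cmac_sketch / cscale_sketch are hypothetical. */
    #include <stdio.h>

    static void cmac_sketch(float ar, float ai, float br, float bi,
                            float *acc_r, float *acc_i)
    {
        /* (ar + i*ai) * (br + i*bi): real ar*br - ai*bi, imag ai*br + ar*bi */
        *acc_r += ar * br - ai * bi;   /* S0 * ar*br + S1 * ai*bi */
        *acc_i += ai * br + ar * bi;   /* S2 * ai*br + S3 * ar*bi */
    }

    static void cscale_sketch(float acc_r, float acc_i,
                              float alphar, float alphai,
                              float *cr, float *ci)
    {
        /* C = alpha * acc, matching the Cr/Ci updates in the tails above */
        *cr = acc_r * alphar - acc_i * alphai;
        *ci = acc_i * alphar + acc_r * alphai;
    }

    int main(void)
    {
        float r = 0.0f, im = 0.0f, cr, ci;
        cmac_sketch(1.0f, 2.0f, 3.0f, 4.0f, &r, &im);   /* (1+2i)*(3+4i) = -5+10i */
        cscale_sketch(r, im, 1.0f, 0.0f, &cr, &ci);     /* alpha = 1+0i leaves it unchanged */
        printf("%.1f %+.1fi\n", cr, ci);                /* prints -5.0 +10.0i */
        return 0;
    }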
pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = 
B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi 
+= 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..a613f0bce --- /dev/null +++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,492 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dgemm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl 
= __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai 
= m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + 
bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 
8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 
= 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; 
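The only difference between these trmm kernels and the plain gemm kernels is the off / pass_K bookkeeping repeated in every tile: depending on whether the triangular factor is on the left (LEFT) and on the BACKWARDS direction derived from LEFT/TRANSA, each tile either skips a leading slice of the packed panels and shortens K, or keeps only a leading slice of K. A minimal sketch of that bookkeeping, using the same convention as the #ifdef blocks here; the function and type names are hypothetical:

    /* Sketch only: effective inner-product range for one mr x nr trmm tile.
       Mirrors the #ifdef LEFT / #ifdef BACKWARDS logic of this kernel. */
    typedef long long idx_t;          /* stand-in for BLASLONG */

    static void trmm_tile_range_sketch(int left, int backwards,
                                       idx_t K, idx_t offset,
                                       idx_t m_top, idx_t n_top,
                                       idx_t mr, idx_t nr,
                                       idx_t *a_skip, idx_t *b_skip,
                                       idx_t *pass_K)
    {
        idx_t off = left ? (offset + m_top) : (-offset + n_top);
        if (backwards) {
            /* skip the first `off` k-steps of both packed panels */
            *a_skip = off * mr;
            *b_skip = off * nr;
            *pass_K = K - off;
        } else {
            /* keep only the first off + mr (or off + nr) k-steps */
            *a_skip = 0;
            *b_skip = 0;
            *pass_K = off + (left ? mr : nr);
        }
    }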
+ pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + 
BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ad720e694 --- /dev/null +++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c @@ -0,0 +1,791 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='sgemm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 
8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = 
B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + 
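Throughout the gemm variants, each accumulator column is folded into C as C := C + alpha * acc with a vle32 / vfmacc_vf / vse32 sequence like the one above (the trmm variants instead overwrite C, scaling the accumulator with vfmul). A self-contained sketch of that update for one column, assuming a toolchain that ships the v1.0 RVV intrinsics in <riscv_vector.h>; the function name is illustrative:

    /* Sketch only: c[0..n) += alpha * acc[0..n) with the same f32, LMUL=2
       intrinsics used by the kernels above. */
    #include <riscv_vector.h>
    #include <stddef.h>

    static void update_c_column_sketch(float *c, const float *acc,
                                       float alpha, size_t n)
    {
        while (n > 0) {
            size_t vl = __riscv_vsetvl_e32m2(n);
            vfloat32m2_t vc = __riscv_vle32_v_f32m2(c, vl);
            vfloat32m2_t va = __riscv_vle32_v_f32m2(acc, vl);
            vc = __riscv_vfmacc_vf_f32m2(vc, alpha, va, vl);  /* vc += alpha * va */
            __riscv_vse32_v_f32m2(c, vc, vl);
            c += vl;
            acc += vl;
            n -= vl;
        }
    }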
result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + C[ci + 4 * ldc + 0] += alpha * result8; + C[ci + 4 * ldc + 1] += alpha * result9; + C[ci + 5 * ldc + 0] += alpha * result10; + C[ci + 5 * ldc + 1] += alpha * result11; + C[ci + 6 * ldc + 0] += alpha * result12; + C[ci + 6 * ldc + 1] += alpha * result13; + C[ci + 7 * ldc + 0] += alpha * result14; + C[ci + 7 * ldc + 1] += alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + C[ci + 4 * ldc + 0] += alpha * result4; + C[ci + 5 * ldc + 0] += alpha * result5; + C[ci + 6 * ldc + 0] += alpha * result6; + C[ci + 7 * ldc + 0] += alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += 
ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for 
(BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + 
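Once M is no longer a multiple of the vector tile width, the leftover rows are handled with plain scalar dot products, as in the M & 2 and M & 1 branches here. The contract of every tile, vectorized or not, can be written as a short scalar reference, which is handy for checking the vector paths; a minimal sketch assuming column-major C with leading dimension ldc and the packed panel layout used above (mr consecutive A elements and nr consecutive B elements per k-step), with hypothetical names:

    /* Sketch only: scalar reference for one mr x nr tile of the packed gemm
       micro-kernel, using the same indexing as the scalar tails above. */
    typedef long long idx_t;          /* stand-in for BLASLONG */

    static void gemm_tile_ref_sketch(idx_t mr, idx_t nr, idx_t K, float alpha,
                                     const float *A, const float *B, float *C,
                                     idx_t ldc, idx_t m_top, idx_t n_top)
    {
        for (idx_t j = 0; j < nr; j++) {
            for (idx_t i = 0; i < mr; i++) {
                float acc = 0.0f;
                idx_t ai = m_top * K + i;     /* packed A panel, mr wide */
                idx_t bi = n_top * K + j;     /* packed B panel, nr wide */
                for (idx_t k = 0; k < K; k++) {
                    acc += A[ai] * B[bi];
                    ai += mr;
                    bi += nr;
                }
                C[(n_top + j) * ldc + (m_top + i)] += alpha * acc;
            }
        }
    }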
+ BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ef18f036c --- /dev/null +++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c @@ -0,0 +1,991 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='strmm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define 
BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif 
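+/*
+ * TRMM triangular range (illustrative summary): only part of the K loop
+ * contributes because one operand is triangular.  With BACKWARDS defined, the
+ * first `off` packed entries of each A/B panel are skipped and the trip count
+ * shrinks (pass_K -= off); otherwise the loop is truncated to
+ * pass_K = off + (M tile size if LEFT, else N tile size).
+ */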
+#ifdef BACKWARDS + ai += off * 4; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + 
result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + C[ci + 4 * ldc + 0] = alpha * result8; + C[ci + 4 * ldc + 1] = alpha * result9; + C[ci + 5 * ldc + 0] = alpha * result10; + C[ci + 5 * ldc + 1] = alpha * result11; + C[ci + 6 * ldc + 0] = alpha * result12; + C[ci + 6 * ldc + 1] = alpha * result13; + C[ci + 7 * ldc + 0] = alpha * result14; + C[ci + 7 * ldc + 1] = alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + C[ci + 4 * ldc + 0] = alpha * result4; + C[ci + 5 * ldc + 0] = alpha * result5; + C[ci + 6 * ldc + 0] = alpha * result6; + C[ci + 7 * ldc + 0] = alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = 
__riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc 
+ m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, 
gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + 
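+      /*
+       * Illustrative note: unlike the GEMM kernel above, the TRMM writeback
+       * overwrites C with alpha times the accumulated product (vfmul + vse32)
+       * rather than loading C and accumulating into it.
+       */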
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = 
__riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = 
__riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 
1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * 
B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + 
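+      /*
+       * Illustrative note on the access pattern: C holds interleaved complex
+       * values, so each column is split into real/imaginary vectors with
+       * strided loads (vlse64, byte stride 2*sizeof(FLOAT)); the vsse64 stores
+       * below re-interleave them the same way.
+       */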
vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 
* A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + 
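+      /*
+       * Scalar complex update (illustrative): with acc = result0 + i*result1
+       * and alpha = alphar + i*alphai, the statements above compute
+       * C += alpha*acc, i.e.
+       *     Cr += alphar*result0 - alphai*result1;
+       *     Ci += alphar*result1 + alphai*result0;
+       * result0/result1 already carry the S0..S3 signs that select the
+       * conjugation variant.
+       */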
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = 
-offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + 
vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 
+ 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= 
result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = 
m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += 
off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 
0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/param.h b/param.h index c5c70b78e..a1a70400c 100644 --- a/param.h +++ b/param.h @@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef RISCV64_ZVL256B #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 4a12cf53ec116c06e5d74073b54a3bca6046cb17 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:13:35 +0000 Subject: [PATCH 026/191] [RISC-V] Improve RVV kernel generator LMUL usage The RVV kernel generation script uses the provided LMUL to increase the number of accumulator registers. Since the effect of the LMUL is to group together the vector registers into larger ones, it actually should be used as a multiplier in the calculation of vlenmax. At the moment, no matter what LMUL is provided, the generated kernels would only set the maximum number of vector elements equal to VLEN/SEW. Commit changes the use of LMUL to properly adjust vlenmax. Note that an increase in LMUL results in a decrease in the number of effective vector registers. 
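As a rough illustration of the register arithmetic described above, here is a minimal sketch in the generator's own Python. The concrete values are assumptions for illustration only (VLEN=512 and 64-bit elements, as on the x280 target); the variable names are illustrative and not taken from generate_kernel.py.

    # Effect of LMUL on vlenmax and on the usable register budget (illustrative values).
    reg_width_bits = 512     # assumed VLEN of the target vector unit
    elen = 64                # assumed element width (SEW), e.g. double precision
    lmul = 2                 # register grouping factor (LMUL_ACC)

    # Grouping registers multiplies the elements held per (grouped) register...
    vlenmax = reg_width_bits * lmul // elen    # 512 * 2 / 64 = 16 elements
    # ...but divides the number of addressable register groups.
    usable_regs = 32 // lmul                   # 32 / 2 = 16 register groups

    print(vlenmax, usable_regs)                # -> 16 16

In other words, doubling LMUL doubles vlenmax but halves the number of register names the kernel can allocate, which is why the generator now compares required_regs against 32 // LMUL_ACC instead of a fixed 32.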
--- kernel/riscv64/generate_kernel.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py index e2ce97971..8be7c9f9c 100755 --- a/kernel/riscv64/generate_kernel.py +++ b/kernel/riscv64/generate_kernel.py @@ -197,13 +197,13 @@ def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): dest.write("ai += {M}*2;") dest.write() - - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 ) pass_regs = (accumulation_regs + a_regs)*2 - tmp_regs = 32-pass_regs + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs if tmp_regs < 2: raise RuntimeError("Complex kernel would use too many registers!") @@ -337,10 +337,12 @@ def generate_gemm_kernel( settings, OUTPUT ): M = settings['M'].value N = settings['N'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) a_regs = max(int(M/vlenmax), 1) - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N required_regs = accumulation_regs + a_regs if is_complex: required_regs = required_regs * 2 + 2 @@ -380,9 +382,9 @@ def generate_gemm_kernel( settings, OUTPUT ): '''.format(tail_policy=settings['tail_policy'].value)) - if required_regs > 32: - raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( - required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value )) TRMM = (settings['op'].value == 'trmm') @@ -448,7 +450,8 @@ def generate_gemm_kernel( settings, OUTPUT ): def generate_M_tails( dest, settings, M, N ): M_tail = int(M/2) M_tail_min = settings['M_tail_scalar_from'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value + / settings['ELEN_PARAM'].value ) TRMM = (settings['op'].value == 'trmm') is_complex = settings['complex'].value generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real @@ -667,4 +670,4 @@ def main(): ERROR("unsupported kernel type {}".format(settings['op'])) if __name__ == "__main__": - main() \ No newline at end of file + main() From 4e738e561a95707e5ae41465af4896b74b2ca138 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 8 Dec 2023 12:36:08 -0600 Subject: [PATCH 027/191] Replace two vector loads with one vector pair load and fix endianess of stores. 
--- kernel/power/sgemm_tcopy_16_power8.S | 3 + kernel/power/sgemm_tcopy_macros_16_power10.S | 323 +++++++++++++++++++ kernel/power/sgemm_tcopy_macros_16_power8.S | 6 + 3 files changed, 332 insertions(+) create mode 100644 kernel/power/sgemm_tcopy_macros_16_power10.S diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs40, o0, T1 + stxvx vs41, o16, T1 + stxvx vs42, o32, T1 + stxvx vs43, o48, T1 +#else + stxvx vs41, o0, T1 + stxvx vs40, o16, T1 + stxvx vs43, o32, T1 + stxvx vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs44, o0, T1 + stxvx vs45, o16, T1 + stxvx vs46, o32, T1 + stxvx vs47, o48, T1 +#else + stxvx vs45, o0, T1 + stxvx vs44, o16, T1 + stxvx vs47, o32, T1 + stxvx vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + lxvpx vs36, o0, A2 + + lxvpx vs38, o0, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if 
(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index ed592a604..af237d5ee 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -141,6 +142,7 @@ define(`COPY_4x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=4 and M=4 @@ -264,6 +266,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -329,6 +332,7 @@ define(`COPY_2x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=2 and M=4 @@ -418,6 +422,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -465,6 +470,7 @@ define(`COPY_1x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=1 and M=4 From c732f275a27cbd9044d8409c2dd13e1e32f675ca Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Mon, 11 Dec 2023 21:05:31 +0800 Subject: [PATCH 028/191] system_check.cmake: fix arch detection for Darwin PowerPC --- cmake/system_check.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c9671b379..e94497a04 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64) endif() elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*")) set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) @@ -109,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) + if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc"))) set(BINARY 64) else () set(BINARY 32) From 9dbc8129b3987fd038585904e612a8ff0f62f947 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Mon, 11 Dec 2023 21:09:06 +0800 Subject: [PATCH 029/191] cpuid_power.c: add CPU_SUBTYPE_POWERPC_7400 case --- cpuid_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_power.c b/cpuid_power.c index 2526e8d0e..1ced8930a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -160,6 +160,7 @@ int detect(void){ infoCount = HOST_BASIC_INFO_COUNT; host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; From 330101e0b38b95337818a27b29300f1637c1cce5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Dec 2023 21:52:00 +0100 Subject: [PATCH 030/191] Add complex type definitions for MSVC --- lapack-netlib/LAPACKE/include/lapack.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 28f8ad655..dac6b22be 100644 --- 
a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -37,7 +37,15 @@ */ #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex +#endif +#else /* Complex type (single precision) */ #ifndef lapack_complex_float #ifndef __cplusplus @@ -74,6 +82,7 @@ #define lapack_complex_double_imag(z) (cimag(z)) #endif +#endif #endif /* LAPACK_COMPLEX_CUSTOM */ From dcf6999c4e3b6af70d2822b9c2629e2df91378e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 11:27:17 +0100 Subject: [PATCH 031/191] remove extraneous endif --- lapack-netlib/LAPACKE/include/lapack.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index dac6b22be..b9f81792d 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -43,7 +43,6 @@ #define LAPACK_COMPLEX_CUSTOM #define lapack_complex_float _Fcomplex #define lapack_complex_double _Dcomplex -#endif #else /* Complex type (single precision) */ From dcdc35127276b4a261292bf03ca8570e89ee105a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 23:06:22 +0100 Subject: [PATCH 032/191] Add MSVC-compatible complex types --- lapack-netlib/LAPACKE/include/lapacke_config.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index c64fc4416..798a5eb2e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -68,6 +68,17 @@ extern "C" { #endif #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); From aa46f1e4e721550a13524ee890c0d26077c0a9eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 23:07:48 +0100 Subject: [PATCH 033/191] revert addition of MSVC-compatible complex (moved to lapacke_config.h) --- lapack-netlib/LAPACKE/include/lapack.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index b9f81792d..28f8ad655 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -37,14 +37,7 @@ */ #ifndef LAPACK_COMPLEX_CUSTOM -#if defined(_MSC_VER) - #define _CRT_USE_C_COMPLEX_H - #include - #define LAPACK_COMPLEX_CUSTOM - #define lapack_complex_float _Fcomplex - #define lapack_complex_double _Dcomplex -#else /* Complex type (single precision) */ #ifndef lapack_complex_float #ifndef __cplusplus @@ -81,7 +74,6 @@ #define lapack_complex_double_imag(z) (cimag(z)) #endif -#endif #endif /* LAPACK_COMPLEX_CUSTOM */ From a8d3619f65e65cb5a7e24f148a4924339a1a702f Mon Sep 17 00:00:00 2001 From: barracuda156 
Date: Wed, 13 Dec 2023 19:42:56 +0800 Subject: [PATCH 034/191] cc.cmake: add optflags for G5 and G4 kernels --- cmake/cc.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 7b4ef8947..ac8661a7b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -282,6 +282,18 @@ if (${CORE} STREQUAL POWER8) endif () endif () +if (${CORE} STREQUAL PPC970) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") + endif () +endif () + +if (${CORE} STREQUAL PPCG4) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") From d9653af01851f52470f7b65d8650f6f2f5431b6e Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Wed, 13 Dec 2023 19:23:50 +0800 Subject: [PATCH 035/191] KERNEL.PPC970, KERNEL.PPCG4: unbreak CMake parsing Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4366 --- kernel/power/KERNEL.PPC970 | 8 ++++---- kernel/power/KERNEL.PPCG4 | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) From 981e315b30bb50cbb2b7375665f0f11d0d567703 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Thu, 14 Dec 2023 12:01:31 +0800 Subject: [PATCH 036/191] cc.cmake: use -force_cpusubtype_ALL for Darwin PPC --- cmake/cc.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index ac8661a7b..d5bf3b7ee 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -286,12 +286,18 @@ if (${CORE} STREQUAL PPC970) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () endif () if (${CORE} STREQUAL PPCG4) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () endif () if (NOT DYNAMIC_ARCH) From f06b5355667434f29258c3664bc5ea517988040b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Dec 2023 09:58:44 +0100 Subject: [PATCH 037/191] Use C kernel for dgemv_t due to limitations of the old assembly one --- kernel/x86_64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index bea7036c2..f8278c3b4 100644 --- 
a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S endif ifndef DGEMVTKERNEL -DGEMVTKERNEL = dgemv_t.S +DGEMVTKERNEL = dgemv_t_4.c endif ifndef CGEMVNKERNEL From 544cb8630049650cec9bf4c582fda43f334d5c59 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Dec 2023 14:03:59 +0100 Subject: [PATCH 038/191] Mention C906V instruction set limitation and update DYNAMIC_ARCH lists --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index aaadd0d9c..b8d66ed42 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` - (also known to work on C906) + (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) ### Support for multiple targets in a single library @@ -207,9 +207,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. -On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. +On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled. -For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. +For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build. + +on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. From 8c143331b0216809343d056b084e29a0ab3311a7 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Fri, 15 Dec 2023 22:55:52 +0800 Subject: [PATCH 039/191] PPC970: drop -mcpu=970 which seems to produce faulty code Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4376 --- cmake/cc.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5bf3b7ee..00952e810 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -282,18 +282,21 @@ if (${CORE} STREQUAL POWER8) endif () endif () +# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone +# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case. if (${CORE} STREQUAL PPC970) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math") endif () if (APPLE) set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") endif () endif () +# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency? 
if (${CORE} STREQUAL PPCG4) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math") endif () if (APPLE) set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") From dab0da8243ce03d2a8d76e614259c099bf87b819 Mon Sep 17 00:00:00 2001 From: Darshan Patel Date: Tue, 19 Dec 2023 13:56:55 +0530 Subject: [PATCH 040/191] Update GEMM param for NEOVERSEV1 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index ee4640f57..1ec2d16dd 100644 --- a/param.h +++ b/param.h @@ -3396,13 +3396,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 From 7a4fef4f604db2a5e4e0c4ffaebea220a0646ab1 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 15 Dec 2023 12:50:48 +0000 Subject: [PATCH 041/191] Tweak SVE dot kernel This changes the SVE dot kernel to only predicate when necessary as well as streamlining the assembly a bit. The benchmarks seem to indicate this can improve performance by ~33%. --- kernel/arm64/dot_kernel_sve.c | 100 +++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 9c057551e..133de4ab3 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq .Lskip_2x \n" \ + OFFSET_INPUTS \ +".Lvector_2x: \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo .Lvector_2x \n" \ + SUM_VECTOR("1") \ +".Lskip_2x: \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs .Ltail \n" \ +".Lvector_1x: \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo .Lvector_1x \n" \ +".Ltail: \n" \ +" cmp x10, x0 \n" \ +" b.eq .Lend \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +".Lend: \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } From 60e66725e40ef5c6b5a8b2214c3b3a633f699c9b Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 19 Dec 2023 13:11:06 +0000 Subject: [PATCH 042/191] Use numeric labels to allow repeated inlining --- kernel/arm64/dot_kernel_sve.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 133de4ab3..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -67,31 +67,31 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SETUP_TRUE \ " neg x10, x9, lsl #1 \n" \ " ands x11, x10, x0 \n" \ -" b.eq .Lskip_2x \n" \ +" b.eq 2f // skip_2x \n" \ OFFSET_INPUTS \ -".Lvector_2x: \n" \ +"1: // vector_2x \n" \ UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ UPDATE("p0", "x12", "x13", "z0") \ " sub x8, x8, x10 \n" \ " cmp x8, x11 \n" \ -" b.lo .Lvector_2x \n" \ +" b.lo 1b // vector_2x \n" \ SUM_VECTOR("1") \ -".Lskip_2x: \n" \ +"2: // skip_2x \n" \ " neg x10, x9 \n" \ " and x10, x10, x0 \n" \ " cmp x8, x10 \n" \ -" b.hs .Ltail \n" \ -".Lvector_1x: \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ " add x8, x8, x9 \n" \ " cmp x8, x10 \n" \ -" b.lo .Lvector_1x \n" \ -".Ltail: \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ " cmp x10, x0 \n" \ -" b.eq .Lend \n" \ +" b.eq 5f // end \n" \ TAIL_WHILE \ UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ -".Lend: \n" \ +"5: // end \n" \ SUM_VECTOR("0") \ RET From bb8b91e9f2c8176e7a6388fe7a949744771bb29d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Dec 2023 23:13:02 +0100 Subject: [PATCH 043/191] restore OpenBLAS-specific test paths --- lapack-netlib/lapack_testing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 96fbeb2a6..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -255,19 +255,19 @@ for dtype in range_prec: else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==20: # DMD EIG TESTS - cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing From e67a0eaaf9f02e37270ac03747b333b28dac61e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Dec 2023 23:15:11 +0100 Subject: [PATCH 044/191] Restore OpenBLAS-specific build rule changes --- lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 4e7cf4629..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -136,28 +136,28 @@ double: xeigtstd xdmdeigtstd complex16: xeigtstz xdmdeigtstz xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ 
xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(SDMDEIGTST): $(FRC) $(CDMDEIGTST): $(FRC) From a7ed60bfe93a73df853be560244aa33a774088ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Dec 2023 20:05:23 +0100 Subject: [PATCH 045/191] Add lower limit for multithreading --- interface/lapack/gesv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 175350329..546c2bed2 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From ecae1389dff8bcb6643685aedb1fbaeb9b26d79d Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Sat, 23 Dec 2023 12:21:48 +0000 Subject: [PATCH 046/191] Reduce duplication in kernel definitions These files are exactly the same, so I believe we can reduce these files down. Other files require a slightly more complex unpicking. 
--- kernel/arm64/KERNEL.A64FX | 207 +----------------------------- kernel/arm64/KERNEL.CORTEXA55 | 197 +--------------------------- kernel/arm64/KERNEL.THUNDERX3T110 | 185 +------------------------- 3 files changed, 3 insertions(+), 586 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ccbce27e1..bc5999097 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,206 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c 
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index e2e006770..574e98b8c 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -1,196 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c 
-IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = 
dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.CORTEXA53 diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index 41cedc851..5d3bd69f7 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -1,184 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL 
= zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c 
-#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 From dc20a7818899cd6fbcf192b93e1940b353f83a0f Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Sat, 23 Dec 2023 12:19:33 +0000 Subject: [PATCH 047/191] Use functionally equivalent dynamic targets Similar to `drivers/other/dynamic.c`, I've looked for functionally equivalent targets and mapped them in the default DYNAMIC_ARCH build. Users can still build specific cores using DYNAMIC_LIST. --- Makefile.system | 4 ---- driver/others/dynamic_arm64.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/Makefile.system b/Makefile.system index ff06e503c..e602eaf05 100644 --- a/Makefile.system +++ b/Makefile.system @@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 -DYNAMIC_CORE += CORTEXA72 -DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE endif -DYNAMIC_CORE += CORTEXA55 -DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 530d18115..b5fb8161d 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #endif #else extern gotoblas_t gotoblas_CORTEXA53; +#define gotoblas_CORTEXA55 gotoblas_CORTEXA53 extern gotoblas_t gotoblas_CORTEXA57; -extern gotoblas_t gotoblas_CORTEXA72; -extern gotoblas_t gotoblas_CORTEXA73; -extern gotoblas_t gotoblas_FALKOR; +#define gotoblas_CORTEXA72 gotoblas_CORTEXA57 +#define gotoblas_CORTEXA73 gotoblas_CORTEXA57 +#define gotoblas_FALKOR gotoblas_CORTEXA57 extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; @@ -141,7 +142,6 @@ extern gotoblas_t gotoblas_ARMV8SVE; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; -extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); From 8ce44c18a0e81527f7e9db98ee6276af4a408b8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:24:10 +0100 Subject: [PATCH 048/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/TESTING/EIG/cerrst.f | 12 ++++++------ lapack-netlib/TESTING/EIG/chkxer.f | 2 +- lapack-netlib/TESTING/EIG/derrst.f | 4 ++-- lapack-netlib/TESTING/EIG/serrst.f | 4 ++-- lapack-netlib/TESTING/EIG/zerrst.f | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 1748a2aad..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -748,17 +748,17 @@ CALL 
CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index 059538644..7d111e2e0 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -735,12 +735,12 @@ CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index b87fc42ef..408346382 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -733,12 +733,12 @@ CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index d7b41c053..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ 
b/lapack-netlib/TESTING/EIG/zerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 From 5c11b2ff41607ef923593ff73f0317aa93be30a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:27:20 +0100 Subject: [PATCH 049/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f | 7 +++---- lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 5 ++--- lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/schksy_aa_2stage.f | 5 ++--- lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f | 11 ++++++----- lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f | 4 ++-- 8 files changed, 21 insertions(+), 23 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. 
* SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index bc4e77a5a..1940351a4 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -421,9 +421,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'DSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -503,7 +503,6 @@ c NT = 1 CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'DSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 91c9e8e9a..d04106ae3 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using DSYSV_AA. * SRNAMT = 'DSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f index d3c27ae56..6490cd7c3 100644 --- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f @@ -423,9 +423,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'SSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -505,7 +505,6 @@ CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'SSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index aff32bce9..319b90805 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using SSYSV_AA. 
* SRNAMT = 'SSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f index 381fac9f2..51082f1d0 100644 --- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f @@ -8,7 +8,7 @@ * Definition: * =========== * -* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, +* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, * NNS, NSVAL, THRESH, TSTERR, NMAX, A, * AFAC, AINV, B, X, XACT, WORK, RWORK, * IWORK, NOUT ) @@ -185,7 +185,8 @@ LOGICAL DOTYPE( * ) INTEGER IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * ) COMPLEX*16 A( * ), AFAC( * ), AINV( * ), B( * ), - $ RWORK( * ), WORK( * ), X( * ), XACT( * ) + $ WORK( * ), X( * ), XACT( * ) + DOUBLE PRECISION RWORK( * ) * .. * * ===================================================================== @@ -430,9 +431,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'ZHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) + CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f index 9401867e0..fcd774491 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using ZHESV_AA. 
* SRNAMT = 'ZHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * From 0814491d968da09eebee620fe573750a4b703944 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:37:03 +0100 Subject: [PATCH 050/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/cgebrd.f | 32 ++++++---- lapack-netlib/SRC/cgehrd.f | 20 +++--- lapack-netlib/SRC/cgelq.f | 8 +-- lapack-netlib/SRC/cgelqf.f | 20 +++--- lapack-netlib/SRC/cgemlq.f | 35 +++++++---- lapack-netlib/SRC/cgemqr.f | 31 ++++++--- lapack-netlib/SRC/cgeqlf.f | 12 ++-- lapack-netlib/SRC/cgeqp3rk.f | 24 +++---- lapack-netlib/SRC/cgeqr.f | 23 ++++--- lapack-netlib/SRC/cgeqrfp.f | 26 +++++--- lapack-netlib/SRC/cgesvdx.f | 14 ++--- lapack-netlib/SRC/cgesvj.f | 70 ++++++++++++--------- lapack-netlib/SRC/cgetri.f | 6 +- lapack-netlib/SRC/cgetsls.f | 7 ++- lapack-netlib/SRC/cgetsqrhrt.f | 27 +++++--- lapack-netlib/SRC/cgges3.f | 40 +++++++----- lapack-netlib/SRC/cggev3.f | 28 +++++---- lapack-netlib/SRC/cgghd3.f | 22 ++++--- lapack-netlib/SRC/cggqrf.f | 6 +- lapack-netlib/SRC/cggrqf.f | 6 +- lapack-netlib/SRC/cggsvd3.f | 4 +- lapack-netlib/SRC/cggsvp3.f | 4 +- lapack-netlib/SRC/cheevd.f | 11 ++-- lapack-netlib/SRC/cheevr.f | 31 +++++---- lapack-netlib/SRC/cheevr_2stage.f | 57 ++++++++++------- lapack-netlib/SRC/cheevx.f | 6 +- lapack-netlib/SRC/chesv_aa.f | 15 ++--- lapack-netlib/SRC/chesv_aa_2stage.f | 24 +++---- lapack-netlib/SRC/chesvx.f | 11 ++-- lapack-netlib/SRC/chetrd_2stage.f | 94 +++++++++++++++------------- lapack-netlib/SRC/chetrd_hb2st.F | 40 +++++++----- lapack-netlib/SRC/chetrd_he2hb.f | 20 +++--- lapack-netlib/SRC/chetrf.f | 8 +-- lapack-netlib/SRC/chetrf_aa.f | 29 ++++++--- lapack-netlib/SRC/chetrf_aa_2stage.f | 25 ++++---- lapack-netlib/SRC/chetrf_rk.f | 10 +-- lapack-netlib/SRC/chetrf_rook.f | 6 +- lapack-netlib/SRC/chetri2.f | 30 +++++---- lapack-netlib/SRC/chetri_3.f | 21 ++++--- lapack-netlib/SRC/chetrs_aa.f | 27 +++++--- lapack-netlib/SRC/clamswlq.f | 66 +++++++++++-------- lapack-netlib/SRC/clamtsqr.f | 74 ++++++++++++---------- lapack-netlib/SRC/claswlq.f | 80 +++++++++++++---------- lapack-netlib/SRC/clatrs3.f | 32 +++++++--- lapack-netlib/SRC/clatsqr.f | 89 +++++++++++++++----------- lapack-netlib/SRC/dsytrf.f | 5 +- lapack-netlib/SRC/ssytrd_sb2st.F | 40 +++++++----- 47 files changed, 783 insertions(+), 533 deletions(-) diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. 
* .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index f407f931a..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index ff482bc42..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. 
The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 75f5bc960..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. 
-*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. 
LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index 918bbddad..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -187,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -277,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f index 587860684..731c44edb 100644 --- a/lapack-netlib/SRC/cgeqp3rk.f +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for CGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -627,8 +628,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2 - EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN @@ -703,7 +705,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -726,7 +728,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -778,7 +780,7 @@ * * Array TAU is not set and contains undefined elements. 
* - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -797,7 +799,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -828,7 +830,7 @@ DO J = 1, MINMN TAU( J ) = CZERO END DO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -873,7 +875,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -991,7 +993,7 @@ * * Return from the routine. * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1082,7 +1084,7 @@ * END IF * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index eaf98ddf3..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. 
LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL CGEQR2P, CLARFB, CLARFT, XERBLA @@ -182,8 +183,16 @@ * INFO = 0 NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +200,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -203,7 +212,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -211,7 +219,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +281,7 @@ $ CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQRFP diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index fbdb121ca..e1856a65f 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -208,7 +208,7 @@ *> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (MAX(1,LWORK)) -*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK; +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -312,8 +312,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -448,7 +448,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -464,7 +464,7 @@ * * Quick return if possible * - IF( M.EQ.0 .OR. N.EQ.0 ) THEN + IF( MINMN.EQ.0 ) THEN RETURN END IF * @@ -846,7 +846,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f index 149cf5e48..b9c8f1709 100644 --- a/lapack-netlib/SRC/cgesvj.f +++ b/lapack-netlib/SRC/cgesvj.f @@ -208,15 +208,17 @@ *> \verbatim *> CWORK is COMPLEX array, dimension (max(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. 
*> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -374,16 +378,17 @@ PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX AAPQ, OMPQ - REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX AAPQ, OMPQ + REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND, + $ MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, + $ RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -398,8 +403,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -422,7 +427,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M + N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +456,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +468,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index 2060d1444..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -153,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -252,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index b4bb7562f..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 29b0bf4af..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -251,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. 
* CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 273ab3ef7..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -250,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index b5ca804eb..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -282,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -378,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 05c5e66be..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. 
*> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. 
*> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. 
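A minimal caller-side sketch of the two-phase workspace-query idiom that the CHEEVR and CHEEVR_2STAGE hunks above standardize: pass LWORK = LRWORK = LIWORK = -1, read the required sizes back from WORK(1), RWORK(1) and IWORK(1) (the patched routines store the first two through SROUNDUP_LWORK, and all three are now at least 1 even for degenerate N), then call again with those sizes. The routine shown is CHEEVR; the 4-by-4 test matrix and the LWMAX/LRWMAX/LIWMAX caps are illustrative assumptions for the sketch, not taken from the patch.

      PROGRAM WSQDEMO
*     Hedged sketch: two-phase workspace query for CHEEVR.
*     NMAX and the workspace caps below are assumptions for the demo.
      INTEGER            NMAX, LWMAX, LRWMAX, LIWMAX
      PARAMETER          ( NMAX = 4, LWMAX = 1024, LRWMAX = 1024,
     $                     LIWMAX = 1024 )
      INTEGER            I, J, N, M, INFO, LWORK, LRWORK, LIWORK
      INTEGER            ISUPPZ( 2*NMAX ), IWORK( LIWMAX )
      REAL               W( NMAX ), RWORK( LRWMAX )
      COMPLEX            A( NMAX, NMAX ), Z( NMAX, NMAX ),
     $                   WORK( LWMAX )
      EXTERNAL           CHEEVR
      INTRINSIC          INT, MIN
*
      N = NMAX
*     Simple Hermitian test matrix: 2 on the diagonal, 1 elsewhere.
      DO 20 J = 1, N
         DO 10 I = 1, N
            A( I, J ) = ( 1.0E0, 0.0E0 )
   10    CONTINUE
         A( J, J ) = ( 2.0E0, 0.0E0 )
   20 CONTINUE
*
*     Phase 1: query only.  With all three lengths set to -1 the
*     routine reports its sizes in WORK(1), RWORK(1) and IWORK(1)
*     and returns without computing anything.
      LWORK  = -1
      LRWORK = -1
      LIWORK = -1
      CALL CHEEVR( 'N', 'A', 'U', N, A, NMAX, 0.0E0, 0.0E0, 0, 0,
     $             0.0E0, M, W, Z, NMAX, ISUPPZ, WORK, LWORK, RWORK,
     $             LRWORK, IWORK, LIWORK, INFO )
      LWORK  = MIN( LWMAX,  INT( WORK( 1 ) ) )
      LRWORK = MIN( LRWMAX, INT( RWORK( 1 ) ) )
      LIWORK = MIN( LIWMAX, IWORK( 1 ) )
*
*     Phase 2: compute the eigenvalues with the reported sizes.
      CALL CHEEVR( 'N', 'A', 'U', N, A, NMAX, 0.0E0, 0.0E0, 0, 0,
     $             0.0E0, M, W, Z, NMAX, ISUPPZ, WORK, LWORK, RWORK,
     $             LRWORK, IWORK, LIWORK, INFO )
      WRITE( *, * ) 'INFO =', INFO
      WRITE( *, * ) ( W( I ), I = 1, M )
      END

After these patches the sizes returned by the query path are never smaller than 1, so the same calling sequence also covers the degenerate N = 0 and N = 1 cases without tripping the LWORK/LRWORK/LIWORK argument checks.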
@@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index e91599a44..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 53ecc0a16..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -197,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -207,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -240,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 12950c4af..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -203,7 +204,7 @@ * * .. Local Scalars .. 
LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -225,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +270,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index c23a35ce7..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 3688e40a3..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. 
+*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -262,7 +267,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN @@ -286,7 +291,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -295,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -318,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -575,8 +584,7 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 090f02100..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 0c596ffe7..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -228,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -347,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 0547a4eab..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. @@ -179,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -460,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 400efdf26..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index ef442c937..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. 
LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -311,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 1593c2edc..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -264,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -387,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of CHETRI2 diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f index deda63598..ccfce5070 100644 --- a/lapack-netlib/SRC/chetri_3.f +++ b/lapack-netlib/SRC/chetri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'CHETRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 879549106..07179ab92 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -151,24 +157,30 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME,SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 5daf60bf6..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -193,91 +196,100 @@ *> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. 
MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = SROUNDUP_LWORK(LW) RETURN - ELSE IF (LQUERY) THEN - WORK(1) = SROUNDUP_LWORK(LW) + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -404,7 +416,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 05021e642..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -195,45 +197,47 @@ *> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. 
* * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -241,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -256,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(LW) + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -412,7 +422,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 12e8373df..2044e055c 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,33 +165,35 @@ *> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. 
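
To make the reindented blocking logic in the CLASWLQ body below concrete (the numbers are illustrative, not taken from the patch): with M = 4, N = 19 and NB = 8, the first CGELQT call factors columns 1:8; KK = MOD( N-M, NB-M ) = MOD( 15, 4 ) = 3 and II = N-KK+1 = 17, so the DO loop runs for I = 9 and I = 13 (step NB-M = 4) and its CTPLQT updates cover columns 9:12 and 13:16; the final CTPLQT then handles the KK = 3 trailing columns 17:19. Every column is visited exactly once: 8 + 4 + 4 + 3 = 19.
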
* * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME INTEGER ILAENV REAL SROUNDUP_LWORK EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -200,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -213,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(MB*M) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index 0502f6898..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. 
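
As a worked instance of the CLATRS3 workspace bound documented above (sizes chosen only for illustration): with N = 1000, NRHS = 8 and a block size NB = 64, NBA = CEIL( N/NB ) = 16, so the scale-factor part needs NBA*MAX( NBA, MIN( NRHS, 32 ) ) = 16*16 = 256 entries and the block-norm part another NBA*NBA = 256, i.e. LWORK >= 512. This is exactly the LSCALE + LANRM split that the rewritten code further below stores in LWMIN, with the new quick path returning LWMIN = 1 when MIN( N, NRHS ) = 0.
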
+*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index cd2cb4aa7..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -165,32 +168,34 @@ *> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. 
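
A detail worth noting across these files: WORK( 1 ) is a REAL or COMPLEX array element whose significand is only 24 bits wide in single precision, so a very large integer workspace size cannot always be stored in it exactly, and a plain REAL conversion could truncate back to something smaller than the requirement. That is why the rewritten code funnels LWMIN through SROUNDUP_LWORK. The sketch below demonstrates the effect with an invented helper named ROUNDUP; it is not LAPACK's SROUNDUP_LWORK source, just a minimal illustration of the idea.

      ! Why WORK( 1 ) is set via SROUNDUP_LWORK rather than REAL( LWMIN ).
      ! ROUNDUP is an invented stand-in for illustration only.
      PROGRAM ROUNDUP_DEMO
         IMPLICIT NONE
         INTEGER :: LWMIN
         REAL    :: ASREAL

         LWMIN  = 2**24 + 1       ! first INTEGER a default REAL cannot hold
         ASREAL = REAL( LWMIN )   ! rounds to 2**24, losing the +1
         PRINT *, 'LWMIN                 =', LWMIN
         PRINT *, 'INT( REAL( LWMIN ) )  =', INT( ASREAL )          ! too small
         PRINT *, 'INT( ROUNDUP(LWMIN) ) =', INT( ROUNDUP( LWMIN ) )

      CONTAINS

         REAL FUNCTION ROUNDUP( LWORK )
            INTEGER, INTENT(IN) :: LWORK
            ROUNDUP = REAL( LWORK )
            ! Nudge upward until truncation no longer loses anything.
            DO WHILE ( INT( ROUNDUP ) .LT. LWORK )
               ROUNDUP = ROUNDUP * ( 1.0 + EPSILON( 1.0 ) )
            END DO
         END FUNCTION ROUNDUP

      END PROGRAM ROUNDUP_DEMO
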
* * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -201,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -208,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(NB*N) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index aee9b3f6a..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 32bae26dc..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -261,7 +266,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN @@ -283,7 +288,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -292,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. 
.NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -315,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -544,8 +553,7 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST From 29d6024ec534858294e648d189f9026302a903b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:44:11 +0100 Subject: [PATCH 051/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/dgebrd.f | 26 ++++-- lapack-netlib/SRC/dgehrd.f | 24 +++-- lapack-netlib/SRC/dgelq.f | 4 +- lapack-netlib/SRC/dgelqf.f | 20 ++-- lapack-netlib/SRC/dgelsd.f | 7 +- lapack-netlib/SRC/dgemlq.f | 32 ++++--- lapack-netlib/SRC/dgemqr.f | 32 ++++--- lapack-netlib/SRC/dgeqlf.f | 10 +- lapack-netlib/SRC/dgeqp3rk.f | 3 +- lapack-netlib/SRC/dgeqr.f | 20 ++-- lapack-netlib/SRC/dgeqrfp.f | 24 +++-- lapack-netlib/SRC/dgerqf.f | 4 +- lapack-netlib/SRC/dgesvj.f | 32 +++++-- lapack-netlib/SRC/dgetri.f | 5 +- lapack-netlib/SRC/dgetsls.f | 12 ++- lapack-netlib/SRC/dgetsqrhrt.f | 18 ++-- lapack-netlib/SRC/dgges.f | 12 +-- lapack-netlib/SRC/dgges3.f | 39 +++++--- lapack-netlib/SRC/dggev3.f | 37 ++++---- lapack-netlib/SRC/dgghd3.f | 17 ++-- lapack-netlib/SRC/dggqrf.f | 5 +- lapack-netlib/SRC/dggrqf.f | 4 +- lapack-netlib/SRC/dggsvd3.f | 4 +- lapack-netlib/SRC/dggsvp3.f | 4 +- lapack-netlib/SRC/dlamswlq.f | 62 ++++++++----- lapack-netlib/SRC/dlamtsqr.f | 74 ++++++++------- lapack-netlib/SRC/dlaswlq.f | 92 ++++++++++-------- lapack-netlib/SRC/dlatrs3.f | 27 +++++- lapack-netlib/SRC/dlatsqr.f | 92 ++++++++++-------- lapack-netlib/SRC/dsyev_2stage.f | 22 ++--- lapack-netlib/SRC/dsyevd.f | 5 +- lapack-netlib/SRC/dsyevr.f | 21 +++-- lapack-netlib/SRC/dsyevr_2stage.f | 40 ++++---- lapack-netlib/SRC/dsyevx.f | 6 +- lapack-netlib/SRC/dsysv_aa.f | 13 +-- lapack-netlib/SRC/dsysv_aa_2stage.f | 24 ++--- lapack-netlib/SRC/dsysvx.f | 9 +- lapack-netlib/SRC/dsytrd.f | 4 +- lapack-netlib/SRC/dsytrd_2stage.f | 89 ++++++++++-------- lapack-netlib/SRC/dsytrd_sb2st.F | 134 ++++++++++++++------------- lapack-netlib/SRC/dsytrd_sy2sb.f | 18 ++-- lapack-netlib/SRC/dsytrf_aa.f | 27 ++++-- lapack-netlib/SRC/dsytrf_aa_2stage.f | 22 ++--- lapack-netlib/SRC/dsytrf_rk.f | 6 +- lapack-netlib/SRC/dsytrf_rook.f | 4 +- lapack-netlib/SRC/dsytri2.f | 27 +++--- lapack-netlib/SRC/dsytri_3.f | 21 +++-- lapack-netlib/SRC/dsytrs_aa.f | 27 ++++-- 48 files changed, 751 insertions(+), 510 deletions(-) diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. 
EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. 
LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b1f45a2c6..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -228,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -276,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -372,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. 
The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. 
These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f index 117a68287..b8e41b39c 100644 --- a/lapack-netlib/SRC/dgeqp3rk.f +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for DGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. 
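
The dgeqr.f and dgemqr.f changes that follow tighten the same LWORK bookkeeping for the modern factor-and-apply pair. For orientation, a minimal caller is sketched below: both the T size and LWORK come from query calls (TSIZE = -1 / LWORK = -1; per the documentation a value of -2 requests the minimal rather than the optimal size), so the corner cases handled by this series never require the caller to hard-code a workspace formula. Dimensions and data in the sketch are made up, and error handling is omitted.

      ! Minimal sketch of the DGEQR / DGEMQR query-allocate-compute pattern.
      PROGRAM GEQR_DEMO
         IMPLICIT NONE
         INTEGER, PARAMETER :: M = 8, N = 3, NRHS = 2
         DOUBLE PRECISION   :: A( M, N ), C( M, NRHS ), TQ( 5 ), WQ( 1 )
         DOUBLE PRECISION, ALLOCATABLE :: T( : ), WORK( : )
         INTEGER :: TSIZE, LWORK, INFO

         CALL RANDOM_NUMBER( A )
         CALL RANDOM_NUMBER( C )

         ! Query: TSIZE = -1 and LWORK = -1 return the required sizes in
         ! T( 1 ) and WORK( 1 ) without modifying A.
         CALL DGEQR( M, N, A, M, TQ, -1, WQ, -1, INFO )
         TSIZE = INT( TQ( 1 ) )
         LWORK = INT( WQ( 1 ) )
         ALLOCATE( T( TSIZE ), WORK( MAX( 1, LWORK ) ) )

         ! Factor A = Q*R.
         CALL DGEQR( M, N, A, M, T, TSIZE, WORK, LWORK, INFO )

         ! Query, then apply Q**T to C from the left; K = N reflectors.
         CALL DGEMQR( 'L', 'T', M, NRHS, N, A, M, T, TSIZE, C, M,
     ...  continued as free-form below
         CALL DGEMQR( 'L', 'T', M, NRHS, N, A, M, T, TSIZE, C, M, WQ, -1, INFO )
         DEALLOCATE( WORK )
         LWORK = INT( WQ( 1 ) )
         ALLOCATE( WORK( MAX( 1, LWORK ) ) )
         CALL DGEMQR( 'L', 'T', M, NRHS, N, A, M, T, TSIZE, C, M, WORK, LWORK, INFO )

         PRINT *, 'DGEMQR INFO =', INFO
      END PROGRAM GEQR_DEMO
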
diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
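
The DGETRI hunk below is representative of the whole series: LWKOPT is clamped to MAX( 1, N*NB ) so that a workspace query on an empty matrix reports a legal LWORK instead of 0. A caller exercising that corner case could look like the sketch below; it illustrates the intended post-patch behaviour and is not a test taken from LAPACK.

      ! Sketch of the N = 0 corner case these hunks address: with the
      ! patched DGETRI a workspace query should report at least 1, and
      ! the actual call must accept LWORK = 1.  Illustrative only.
      PROGRAM GETRI_EMPTY
         IMPLICIT NONE
         INTEGER, PARAMETER :: N = 0, LDA = 1
         DOUBLE PRECISION   :: A( LDA, 1 ), WORK( 1 )
         INTEGER            :: IPIV( 1 ), INFO

         ! Workspace query on an empty matrix: expect INT( WORK( 1 ) ) >= 1.
         CALL DGETRI( N, A, LDA, IPIV, WORK, -1, INFO )
         PRINT *, 'query:   INFO =', INFO, '  WORK(1) =', WORK( 1 )

         ! The computation is a quick return, but the argument checks
         ! must still pass with LWORK = 1.
         CALL DGETRI( N, A, LDA, IPIV, WORK, 1, INFO )
         PRINT *, 'compute: INFO =', INFO
      END PROGRAM GETRI_EMPTY
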
* -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. 
*> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index c95c94cbc..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,19 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -209,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index e6d78b672..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. 
+* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 94a04be02..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME @@ -198,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index b27f4cdc7..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -160,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. 
LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 72fbe1e9a..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
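The DSYSV_AA_2STAGE hunks above rely on the fact that both LTB and LWORK of
DSYTRF_AA_2STAGE may be queried with -1 in a single call, and this patch
guarantees that the reported sizes are at least 1. A caller-side sketch of that
double query (not part of the patch; the order n and the test matrix are
illustrative):

      program aa_2stage_query
        implicit none
        integer, parameter :: n = 8
        double precision :: a(n,n), tbq(1), wq(1)
        double precision, allocatable :: tb(:), work(:)
        integer :: ipiv(n), ipiv2(n), ltb, lwork, info, i

        a = 0.0d0
        do i = 1, n
          a(i,i) = 2.0d0
        end do
        ! LTB = -1 and LWORK = -1: only the required sizes are reported,
        ! in TB(1) and WORK(1); no factorization is performed.
        call dsytrf_aa_2stage('U', n, a, n, tbq, -1, ipiv, ipiv2, wq, -1, info)
        ltb   = max(1, int(tbq(1)))
        lwork = max(1, int(wq(1)))
        allocate(tb(ltb), work(lwork))
        ! Second call: the actual two-stage Aasen factorization.
        call dsytrf_aa_2stage('U', n, a, n, tb, ltb, ipiv, ipiv2, work, &
                              lwork, info)
      end program aa_2stage_query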
* -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index bb74dd491..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. 
*> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. 
+* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. 
*> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 9a0b26ce5..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index c65bd86e6..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. 
*> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 086586968..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
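A recurring change in this series, visible in the DSYTRF_AA hunk above, is that
an empty or trivially small problem now reports a workspace of at least 1
instead of 0. A sketch of that degenerate case (not part of the patch; it
assumes the patched library is linked):

      program empty_query
        implicit none
        double precision :: a(1,1), wq(1)
        double precision, allocatable :: work(:)
        integer :: ipiv(1), lwork, info

        ! Workspace query with N = 0: the patched DSYTRF_AA returns
        ! WORK(1) = 1, so the usual allocate-then-call pattern needs no
        ! special casing for empty matrices.
        call dsytrf_aa('U', 0, a, 1, ipiv, wq, -1, info)
        lwork = int(wq(1))
        allocate(work(lwork))
        call dsytrf_aa('U', 0, a, 1, ipiv, work, lwork, info)
      end program empty_query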
* -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. 
NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN From c082669ad46b0f87be6730249b937def5060f80c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 20:05:03 +0100 Subject: [PATCH 052/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/sgebrd.f | 28 +++++--- lapack-netlib/SRC/sgehrd.f | 26 +++++--- lapack-netlib/SRC/sgelq.f | 8 +-- lapack-netlib/SRC/sgelqf.f | 20 ++++-- lapack-netlib/SRC/sgemlq.f | 24 ++++--- lapack-netlib/SRC/sgemqr.f | 22 +++++-- lapack-netlib/SRC/sgeqlf.f | 8 ++- lapack-netlib/SRC/sgeqp3rk.f | 24 +++---- lapack-netlib/SRC/sgeqr.f | 22 ++++--- lapack-netlib/SRC/sgeqrfp.f | 28 +++++--- lapack-netlib/SRC/sgesvj.f | 36 ++++++++--- lapack-netlib/SRC/sgetri.f | 10 +-- lapack-netlib/SRC/sgetsls.f | 7 +- lapack-netlib/SRC/sgetsqrhrt.f | 16 +++-- lapack-netlib/SRC/sgges3.f | 24 +++++-- lapack-netlib/SRC/sggev3.f | 27 +++++--- lapack-netlib/SRC/sgghd3.f | 15 +++-- lapack-netlib/SRC/sggqrf.f | 9 ++- lapack-netlib/SRC/sggrqf.f | 2 +- lapack-netlib/SRC/sggsvd3.f | 2 +- lapack-netlib/SRC/sggsvp3.f | 5 +- lapack-netlib/SRC/slamswlq.f | 64 ++++++++++++------- lapack-netlib/SRC/slamtsqr.f | 76 +++++++++++++--------- lapack-netlib/SRC/slaswlq.f | 92 +++++++++++++++------------ lapack-netlib/SRC/slatrs3.f | 32 +++++++--- lapack-netlib/SRC/slatsqr.f | 95 ++++++++++++++++------------ lapack-netlib/SRC/ssyevd.f | 7 +- lapack-netlib/SRC/ssyevr.f | 19 ++++-- lapack-netlib/SRC/ssyevr_2stage.f | 22 +++++-- lapack-netlib/SRC/ssyevx.f | 6 +- lapack-netlib/SRC/ssysv_aa.f | 18 +++--- lapack-netlib/SRC/ssysv_aa_2stage.f | 27 ++++---- lapack-netlib/SRC/ssysvx.f | 7 +- lapack-netlib/SRC/ssytrd_2stage.f | 86 +++++++++++++------------ lapack-netlib/SRC/ssytrd_sy2sb.f | 18 ++++-- lapack-netlib/SRC/ssytrf.f | 5 +- lapack-netlib/SRC/ssytrf_aa.f | 36 +++++++---- lapack-netlib/SRC/ssytrf_aa_2stage.f | 18 +++--- lapack-netlib/SRC/ssytrf_rk.f | 8 +-- lapack-netlib/SRC/ssytrf_rook.f | 7 +- lapack-netlib/SRC/ssytri2.f | 32 ++++++---- lapack-netlib/SRC/ssytri_3.f | 21 +++--- lapack-netlib/SRC/ssytrs_aa.f | 30 ++++++--- 43 files changed, 671 insertions(+), 418 deletions(-) diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 2d0c6d651..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA @@ -242,9 +243,16 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -252,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -264,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -283,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -342,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(WS) +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 47733d947..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 74c7cc267..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. 
The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 1ceec4742..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 83536825c..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,13 +110,14 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this *> value as WORK(1), and no error message related to WORK @@ -187,7 +188,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -207,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -222,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -250,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -267,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -280,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 3207f8bfd..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -189,12 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -206,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -221,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -249,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -279,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index b1266c89e..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -189,8 +190,9 @@ END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f index bb5da72dc..d3a335b88 100644 --- a/lapack-netlib/SRC/sgeqp3rk.f +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for SGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -618,8 +619,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SNRM2 - EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. 
Intrinsic Functions .. INTRINSIC REAL, MAX, MIN @@ -696,7 +698,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -719,7 +721,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -772,7 +774,7 @@ * * Array TAU is not set and contains undefined elements. * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -791,7 +793,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -822,7 +824,7 @@ DO J = 1, MINMN TAU( J ) = ZERO END DO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -867,7 +869,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -985,7 +987,7 @@ * * Return from the routine. * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1072,7 +1074,7 @@ * END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index d1ee2a828..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -173,8 +174,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -211,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
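A second recurring change in the single-precision files is that every size written to WORK(1) now goes through SROUNDUP_LWORK. The reason is that WORK is REAL: above 2**24 not every integer is exactly representable in single precision, so a plain REAL(LWKOPT) can round down and a caller's INT(WORK(1)) may come back smaller than the documented minimum on the second call. The sketch below only illustrates that failure mode and the round-up idea; it is not the library function itself, and the test value is arbitrary.

    program roundup_demo
      implicit none
      integer :: lwkopt
      real :: w

      lwkopt = 2**24 + 1        ! not exactly representable in REAL
      w = real(lwkopt)          ! rounds down to 2**24
      print *, 'requested', lwkopt, 'recovered', int(w)

      ! Nudge the stored value up until INT() recovers at least the
      ! requested size -- the idea behind SROUNDUP_LWORK.
      if (int(w) < lwkopt) w = w * (1.0 + epsilon(w))
      print *, 'rounded up ->', int(w)
    end program roundup_demo

The double-precision files in the earlier hunks can keep storing LWKOPT directly, since any realistic workspace size is exact in DOUBLE PRECISION.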
* -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index fe71bc4a5..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -137,8 +137,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index d89c6a4e6..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index d80ff4da8..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. 
-*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -216,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -229,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -267,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -350,4 +354,4 @@ * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index e35d4955a..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -361,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -377,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -19 END IF * @@ -385,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -407,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -421,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -657,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index c82d2187f..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -298,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -312,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -320,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 9c5858b5a..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -276,7 +276,12 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -326,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -886,6 +890,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index ebb42a899..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -236,8 +236,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. 
INTRINSIC INT, MAX, MIN @@ -251,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -289,6 +291,7 @@ * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 2163f1ef8..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 053fff5de..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index a463b9064..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -300,8 +300,9 @@ * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 685f823a0..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB * M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,32 +165,35 @@ *> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index 8f0c4bf16..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. 
* .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 86733bb15..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. 
N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index a5e4638d6..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -251,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -335,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index 47e4d7cbf..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. 
LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index a2d6a6231..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -278,6 +278,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -445,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -485,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -505,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -733,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. 
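The caller-side counterpart of the SSYEVR / SSYEVR_2STAGE changes above is the usual two-pass pattern: query with LWORK = LIWORK = -1, read WORK(1) and IWORK(1), allocate, then call again. A minimal sketch, illustration only and not a hunk of this patch; the 4x4 zero matrix, JOBZ = 'V', RANGE = 'A', UPLO = 'U' and the program name are arbitrary choices, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QSYEVR
*     Two-pass call of SSYEVR: workspace query, allocation, solve.
      INTEGER N, LDA, LDZ
      PARAMETER ( N = 4, LDA = N, LDZ = N )
      INTEGER M, INFO, LWORK, LIWORK
      INTEGER ISUPPZ( 2*N ), IQRY( 1 )
      REAL A( LDA, N ), W( N ), Z( LDZ, N ), WQRY( 1 )
      REAL, ALLOCATABLE :: WORK( : )
      INTEGER, ALLOCATABLE :: IWORK( : )
      DATA A / 16*0.0E+0 /
      EXTERNAL SSYEVR
*
*     Query: LWORK = LIWORK = -1 only sets WQRY(1) and IQRY(1)
*     (1 and 1 when N <= 1, otherwise at least 26*N and 10*N).
      CALL SSYEVR( 'V', 'A', 'U', N, A, LDA, 0.0E+0, 0.0E+0, 1, N,
     $             0.0E+0, M, W, Z, LDZ, ISUPPZ, WQRY, -1, IQRY, -1,
     $             INFO )
      LWORK = INT( WQRY( 1 ) )
      LIWORK = IQRY( 1 )
      ALLOCATE( WORK( LWORK ), IWORK( LIWORK ) )
*
*     Second pass does the actual eigendecomposition.
      CALL SSYEVR( 'V', 'A', 'U', N, A, LDA, 0.0E+0, 0.0E+0, 1, N,
     $             0.0E+0, M, W, Z, LDZ, ISUPPZ, WORK, LWORK, IWORK,
     $             LIWORK, INFO )
      END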
* - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 2204aa39b..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e43d4de7f..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 3d88e068e..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. 
LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -204,12 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -226,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -269,7 +272,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index 0d72217eb..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
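The SSYSV_AA_2STAGE hunks above keep two independent query channels, LTB = -1 for TB and LWORK = -1 for WORK, and both minimum sizes come back from a single call. A minimal caller sketch, illustration only and not a hunk of this patch; the 4x4 matrix 2*I, the single right-hand side, UPLO = 'U' and the program name are arbitrary, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QSV2ST
*     Query both workspaces of SSYSV_AA_2STAGE, then solve A*X = B.
      INTEGER N, NRHS, LDA, LDB
      PARAMETER ( N = 4, NRHS = 1, LDA = N, LDB = N )
      INTEGER INFO, LTB, LWORK, IPIV( N ), IPIV2( N )
      REAL A( LDA, N ), B( LDB, NRHS ), TBQ( 1 ), WQ( 1 )
      REAL, ALLOCATABLE :: TB( : ), WORK( : )
      DATA A / 2.0, 4*0.0, 2.0, 4*0.0, 2.0, 4*0.0, 2.0 /
      DATA B / 4*1.0 /
      EXTERNAL SSYSV_AA_2STAGE
*
*     LTB = -1 and LWORK = -1: TBQ(1) and WQ(1) return at least
*     MAX(1,(3*NB+1)*N) and MAX(1,N*NB); nothing is factored yet.
      CALL SSYSV_AA_2STAGE( 'U', N, NRHS, A, LDA, TBQ, -1, IPIV,
     $                      IPIV2, B, LDB, WQ, -1, INFO )
      LTB = INT( TBQ( 1 ) )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( TB( LTB ), WORK( LWORK ) )
*
      CALL SSYSV_AA_2STAGE( 'U', N, NRHS, A, LDA, TB, LTB, IPIV,
     $                      IPIV2, B, LDB, WORK, LWORK, INFO )
      END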
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 4efc43630..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. 
+*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index a788fbcf0..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -234,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -353,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index d6408a978..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,19 +144,19 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME @@ -180,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -204,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -458,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index abe6564c5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 72830543c..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. 
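For SSYTRF_RK, as for SSYTRF_AA above, the query protocol itself is unchanged; only the minimum bound and the rounding through SROUNDUP_LWORK are touched. A minimal caller sketch, illustration only and not a hunk of this patch; the 4x4 matrix 2*I, UPLO = 'U' and the program name are arbitrary, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QTRFRK
*     Workspace query for SSYTRF_RK followed by the factorization.
      INTEGER N, LDA
      PARAMETER ( N = 4, LDA = N )
      INTEGER INFO, LWORK, IPIV( N )
      REAL A( LDA, N ), E( N ), WQ( 1 )
      REAL, ALLOCATABLE :: WORK( : )
      DATA A / 2.0, 4*0.0, 2.0, 4*0.0, 2.0, 4*0.0, 2.0 /
      EXTERNAL SSYTRF_RK
*
*     LWORK = -1: only WQ(1) is set, to MAX(1,N*NB) for this UPLO.
      CALL SSYTRF_RK( 'U', N, A, LDA, E, IPIV, WQ, -1, INFO )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( WORK( LWORK ) )
      CALL SSYTRF_RK( 'U', N, A, LDA, E, IPIV, WORK, LWORK, INFO )
      END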
*> @@ -312,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 339a229e7..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -260,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -383,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. 
N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index bca01105d..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 12fca0c71..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,24 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN From 45ef0d7361f05d96828534264f0536a492b14180 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 20:16:33 +0100 Subject: [PATCH 053/191] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/zgebrd.f | 26 ++++-- lapack-netlib/SRC/zgehrd.f | 23 +++-- lapack-netlib/SRC/zgelq.f | 4 +- lapack-netlib/SRC/zgelqf.f | 20 ++-- lapack-netlib/SRC/zgemlq.f | 28 ++++-- lapack-netlib/SRC/zgemqr.f | 30 ++++-- lapack-netlib/SRC/zgeqlf.f | 10 +- lapack-netlib/SRC/zgeqp3rk.f | 3 +- lapack-netlib/SRC/zgeqr.f | 20 ++-- lapack-netlib/SRC/zgeqrfp.f | 24 +++-- lapack-netlib/SRC/zgesvj.f | 83 ++++++++++------- lapack-netlib/SRC/zgetri.f | 4 +- lapack-netlib/SRC/zgetsls.f | 12 ++- lapack-netlib/SRC/zgetsqrhrt.f | 15 +-- lapack-netlib/SRC/zgges3.f | 34 ++++--- lapack-netlib/SRC/zggev3.f | 24 +++-- lapack-netlib/SRC/zgghd3.f | 15 ++- lapack-netlib/SRC/zggqrf.f | 4 +- lapack-netlib/SRC/zggrqf.f | 4 +- lapack-netlib/SRC/zggsvd3.f | 4 +- lapack-netlib/SRC/zggsvp3.f | 4 +- lapack-netlib/SRC/zheevd.f | 5 +- lapack-netlib/SRC/zheevr.f | 29 ++++-- lapack-netlib/SRC/zheevr_2stage.f | 42 +++++---- lapack-netlib/SRC/zhesv_aa.f | 15 +-- lapack-netlib/SRC/zhesv_aa_2stage.f | 23 +++-- lapack-netlib/SRC/zhesvx.f | 13 +-- lapack-netlib/SRC/zhetrd_2stage.f | 87 +++++++++-------- lapack-netlib/SRC/zhetrd_hb2st.F | 134 ++++++++++++++------------- lapack-netlib/SRC/zhetrd_he2hb.f | 18 ++-- lapack-netlib/SRC/zhetrf.f | 7 +- lapack-netlib/SRC/zhetrf_aa.f | 27 ++++-- lapack-netlib/SRC/zhetrf_aa_2stage.f | 28 +++--- lapack-netlib/SRC/zhetrf_rk.f | 8 +- lapack-netlib/SRC/zhetrf_rook.f | 4 +- lapack-netlib/SRC/zhetri2.f | 27 +++--- lapack-netlib/SRC/zhetrs_aa.f | 27 ++++-- lapack-netlib/SRC/zlamswlq.f | 68 ++++++++------ lapack-netlib/SRC/zlamtsqr.f | 76 ++++++++------- lapack-netlib/SRC/zlaswlq.f | 91 ++++++++++-------- lapack-netlib/SRC/zlatrs3.f | 22 ++++- lapack-netlib/SRC/zlatsqr.f | 95 +++++++++++-------- 42 files changed, 733 insertions(+), 504 deletions(-) diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. 
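The ZGEBRD hunk above is the pattern repeated throughout this commit: the documented minimum becomes 1 when MIN(M,N) = 0, and WORK(1) always reports a usable size. A minimal caller sketch of the query, illustration only and not a hunk of this patch; the 6x4 all-ones matrix and the program name are arbitrary, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QGEBRD
*     Bidiagonal reduction with LWORK obtained from a query.
      INTEGER M, N, LDA
      PARAMETER ( M = 6, N = 4, LDA = M )
      INTEGER INFO, LWORK
      DOUBLE PRECISION D( N ), E( N-1 )
      COMPLEX*16 A( LDA, N ), TAUQ( N ), TAUP( N ), WQ( 1 )
      COMPLEX*16, ALLOCATABLE :: WORK( : )
      DATA A / 24*( 1.0D+0, 0.0D+0 ) /
      EXTERNAL ZGEBRD
*
*     Query: WQ(1) returns (M+N)*NB; the documented minimum is
*     MAX(M,N) here, or 1 if either dimension were zero.
      CALL ZGEBRD( M, N, A, LDA, D, E, TAUQ, TAUP, WQ, -1, INFO )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( WORK( LWORK ) )
      CALL ZGEBRD( M, N, A, LDA, D, E, TAUQ, TAUP, WORK, LWORK, INFO )
      END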
EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. 
The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. 
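Because ZGEMLQ delegates internally to ZLAMSWLQ or ZGEMLQT, a caller never sizes T or WORK by hand: both come from queries against ZGELQ and ZGEMLQ. A minimal sketch of that pipeline, illustration only and not a hunk of this patch; the 3x5 all-ones A, the 5x2 right-hand block C, SIDE = 'L', TRANS = 'C' and the program name are arbitrary, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QGEMLQ
*     LQ factorization with ZGELQ, then apply Q**H to C with ZGEMLQ.
      INTEGER M, N, NRHS, LDA, LDC
      PARAMETER ( M = 3, N = 5, NRHS = 2, LDA = M, LDC = N )
      INTEGER INFO, LWORK, TSIZE
      COMPLEX*16 A( LDA, N ), C( LDC, NRHS ), TQ( 5 ), WQ( 1 )
      COMPLEX*16, ALLOCATABLE :: T( : ), WORK( : )
      DATA A / 15*( 1.0D+0, 0.0D+0 ) /
      DATA C / 10*( 1.0D+0, 0.0D+0 ) /
      EXTERNAL ZGELQ, ZGEMLQ
*
*     Factorization: TSIZE = LWORK = -1 fills TQ(1) and WQ(1).
      CALL ZGELQ( M, N, A, LDA, TQ, -1, WQ, -1, INFO )
      TSIZE = INT( TQ( 1 ) )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( T( TSIZE ), WORK( LWORK ) )
      CALL ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, INFO )
      DEALLOCATE( WORK )
*
*     Multiply: query ZGEMLQ, reallocate, then form Q**H * C.
      CALL ZGEMLQ( 'L', 'C', N, NRHS, M, A, LDA, T, TSIZE, C, LDC,
     $             WQ, -1, INFO )
      LWORK = INT( WQ( 1 ) )
      ALLOCATE( WORK( LWORK ) )
      CALL ZGEMLQ( 'L', 'C', N, NRHS, M, A, LDA, T, TSIZE, C, LDC,
     $             WORK, LWORK, INFO )
      END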
*> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f index 247a3c379..01dcce0de 100644 --- a/lapack-netlib/SRC/zgeqp3rk.f +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for ZGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEQR2P, ZLARFB, ZLARFT @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f index 6cd2335f2..2be45d826 100644 --- a/lapack-netlib/SRC/zgesvj.f +++ b/lapack-netlib/SRC/zgesvj.f @@ -200,23 +200,25 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', then LDV >= max(1,N). -*> If JOBV = 'A', then LDV >= max(1,MV) . +*> If JOBV = 'V', then LDV >= MAX(1,N). +*> If JOBV = 'A', then LDV >= MAX(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim -*> CWORK is COMPLEX*16 array, dimension (max(1,LWORK)) +*> CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. 
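With this change a ZGESVJ caller sizes both workspaces through the same query: either LWORK = -1 or LRWORK = -1 makes the routine return the minimal lengths in CWORK(1) and RWORK(1) and exit. A minimal sketch, illustration only and not a hunk of this patch; the 6x4 all-ones matrix, the job flags 'G', 'U', 'V' and the program name are arbitrary, and Fortran 90 allocatable arrays are assumed.

      PROGRAM QGESVJ
*     One-sided Jacobi SVD with queried CWORK and RWORK lengths.
      INTEGER M, N, LDA, LDV
      PARAMETER ( M = 6, N = 4, LDA = M, LDV = N )
      INTEGER INFO, LWORK, LRWORK
      DOUBLE PRECISION SVA( N ), RWQ( 6 )
      COMPLEX*16 A( LDA, N ), V( LDV, N ), CWQ( 1 )
      DOUBLE PRECISION, ALLOCATABLE :: RWORK( : )
      COMPLEX*16, ALLOCATABLE :: CWORK( : )
      DATA A / 24*( 1.0D+0, 0.0D+0 ) /
      EXTERNAL ZGESVJ
*
*     Query: CWQ(1) and RWQ(1) return M+N and MAX(6,N)
*     (both 1 if MIN(M,N) were 0); no sweeps are performed.
      CALL ZGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, 0, V, LDV,
     $             CWQ, -1, RWQ, -1, INFO )
      LWORK = INT( CWQ( 1 ) )
      LRWORK = INT( RWQ( 1 ) )
      ALLOCATE( CWORK( LWORK ), RWORK( LRWORK ) )
      CALL ZGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, 0, V, LDV,
     $             CWORK, LWORK, RWORK, LRWORK, INFO )
      END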
-*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise. +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -367,23 +371,25 @@ * * .. Local Parameters .. DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) - COMPLEX*16 CZERO, CONE - PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) - INTEGER NSWEEP - PARAMETER ( NSWEEP = 30 ) + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) + INTEGER NSWEEP + PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index ba52f9e72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -180,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. 
of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). 
*> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index 79c01c546..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. 
*> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). 
*> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) RETURN END IF - CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) @@ -324,7 +332,6 @@ END IF * * - HOUS2( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 1d39ac942..247497ab6 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the zhetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the zhetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the zhetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of zhetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. 
+*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN DOUBLE PRECISION ABSTMP @@ -277,7 +282,7 @@ * .. * .. External Functions .. 
LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -285,7 +290,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -294,9 +298,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -358,7 +367,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +375,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +394,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = DBLE( AB( ABDPOS, I ) ) @@ -444,7 +453,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. 
* THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f index 78d4f71b8..433887108 100644 --- a/lapack-netlib/SRC/zhetrf.f +++ b/lapack-netlib/SRC/zhetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -227,7 +227,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * @@ -346,6 +346,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 537c16e8c..381c87d51 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N >= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX*16 ALPHA * .. @@ -178,18 +180,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 477602b5e..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). 
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). 
*> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +422,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMTSQR diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index be4c48539..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,22 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -209,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index 231a17274..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
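As a quick check of the ZLATRS3 workspace layout described just above, here is a tiny sketch (not part of the patch) that evaluates the same quantities the hunk below introduces. The values N = 1000, NRHS = 4 and NB = 64 are assumed purely for illustration; NBRHS = 32 matches the bound MIN(NRHS, 32) in the updated comment.

! zlatrs3_lwmin_demo: illustrative only, not part of this patch.
program zlatrs3_lwmin_demo
   implicit none
   integer, parameter :: n = 1000, nrhs = 4, nb = 64, nbrhs = 32
   integer :: nba, lscale, lanrm, lwmin

   nba    = (n + nb - 1) / nb                  ! number of block rows: 16
   lscale = nba * max(nba, min(nrhs, nbrhs))   ! scale-factor part:    256
   lanrm  = nba * nba                          ! block-norm part:      256
   lwmin  = lscale + lanrm                     ! minimal LWORK:        512
   print '(a, i0)', 'minimal LWORK for ZLATRS3: ', lwmin
end program zlatrs3_lwmin_demo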
+* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 8c938aebc..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR From 201f22f49ae87f83b8970a257bc580e33cb94f30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:27:38 +0100 Subject: [PATCH 054/191] Fix issues related to ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/SRC/cgedmd.f90 | 872 +++++++++++++++++++-------------- lapack-netlib/SRC/dgedmd.f90 | 920 ++++++++++++++++++++--------------- lapack-netlib/SRC/sgedmd.f90 | 916 +++++++++++++++++++--------------- lapack-netlib/SRC/zgedmd.f90 | 848 +++++++++++++++++++------------- 4 files changed, 2084 insertions(+), 1472 deletions(-) diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 index 499489270..1413130ec 100644 --- a/lapack-netlib/SRC/cgedmd.f90 +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -1,22 +1,526 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! 
COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. 
+!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. 
It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. 
+!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). 
+!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & M, N, X, LDX, Y, LDY, NRNK, TOL, & K, EIGS, Z, LDZ, RES, B, LDB, & W, LDW, S, LDS, ZWORK, LZWORK, & RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 +! +! -- LAPACK driver routine -- +! +! 
-- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! !..... USE iso_fortran_env IMPLICIT NONE INTEGER, PARAMETER :: WP = real32 -!..... +! ! Scalar arguments +! ~~~~~~~~~~~~~~~~ CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & NRNK, LDZ, LDB, LDW, LDS, & LIWORK, LRWORK, LZWORK INTEGER, INTENT(OUT) :: K, INFO REAL(KIND=WP), INTENT(IN) :: TOL +! ! Array arguments +! ~~~~~~~~~~~~~~~ COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & W(LDW,*), S(LDS,*) @@ -25,364 +529,14 @@ REAL(KIND=WP), INTENT(OUT) :: RES(*) REAL(KIND=WP), INTENT(OUT) :: RWORK(*) INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! CGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, CGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, CGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ -!...................................................................... -! Arguments -! ========= -! JOBS (input) CHARACTER*1 -! 
Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. -!..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. -!..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. -!..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. -!..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: CGESVD (the QR SVD algorithm) -! 2 :: CGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. -!..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). -!..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). -!..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. 
the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. -!..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. -!..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. -!..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. -!..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. -!..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. -!..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. -!..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. -!..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. -!..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. -!..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. -!..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. -!..... -! LDB (input) INTEGER, LDB >= M -! 
The leading dimension of the array B. -!..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. -!..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. -!..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by CGEEV. -! See the description of K. -!..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. -!..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to CGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. -!..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), -! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: CGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: CGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: CGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. -!..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to CGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. -!..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where -! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: CGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: CGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: CGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). -!..... -! 
IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. -!..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. -!..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. -!............................................................. -!............................................................. ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -400,7 +554,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 @@ -408,13 +562,13 @@ INTEGER ICAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL CAXPY, CGEMM, CSSCAL EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & CLACPY, CLASCL, CLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC FLOAT, INT, MAX, SQRT @@ -607,7 +761,8 @@ K = 0 DO i = 1, N !WORK(i) = SCNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -680,7 +835,8 @@ ! carefully computed using CLASSQ. DO i = 1, N !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 index 20424808f..15df48fe9 100644 --- a/lapack-netlib/SRC/dgedmd.f90 +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -1,424 +1,574 @@ - SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! 
M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! DGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, DGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, DGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! 
expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ !............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! 
nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 
'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: DGESVD (the QR SVD algorithm) -! 2 :: DGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! 
left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1). -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. 
If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! 
REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! 
IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by DGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by DGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! 
If the call to DGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! leng of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to DGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> leng of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of DGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of DGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of DGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of DGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. 
+!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. 
+!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -432,10 +582,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 @@ -443,13 +594,13 @@ INTEGER IDAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL DAXPY, DGEMM, DSCAL EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & DLACPY, DLASCL, DLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -632,7 +783,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -705,7 +857,8 @@ ! carefully computed using DLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -1051,4 +1204,3 @@ RETURN ! ...... END SUBROUTINE DGEDMD - diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 index 49cb11527..4860e8898 100644 --- a/lapack-netlib/SRC/sgedmd.f90 +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -1,423 +1,573 @@ - SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! 
SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real32 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! SGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, SGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, SGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. 
For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim !...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! 
nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 
'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: SGESVD (the QR SVD algorithm) -! 2 :: SGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! 
left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. 
If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! 
REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! 
IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! left singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by SGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! 
If the call to SGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! length of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of SGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of SGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of SGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of SGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. 
+!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. 
+!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -431,11 +581,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 @@ -443,13 +593,13 @@ INTEGER ISAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL SAXPY, SGEMM, SSCAL EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & SLACPY, SLASCL, SLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC INT, FLOAT, MAX, SQRT @@ -632,7 +782,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -705,7 +856,8 @@ ! carefully computed using SLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 index 090641ad8..5045cb166 100644 --- a/lapack-netlib/SRC/zgedmd.f90 +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -1,389 +1,539 @@ - SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, EIGS, Z, LDZ, RES, B, LDB, & - W, LDW, S, LDS, ZWORK, LZWORK, & - RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 -!..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 - -!..... -! 
Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LIWORK, LRWORK, LZWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL -! Array arguments - COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) - COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) - REAL(KIND=WP), INTENT(OUT) :: RES(*) - REAL(KIND=WP), INTENT(OUT) :: RWORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, ZGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, ZGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== ! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. 
For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... 
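In the notation of the Purpose section and the argument descriptions that follow, the computation can be summarized by the standard projected-DMD recipe of references [1] and [2] above. This is only an illustrative sketch; the exact internal formulas and scalings used by ZGEDMD may differ:

    1. Truncated SVD of the first snapshot matrix:  X = U*SIGMA*V^H, keeping the
       K leading singular triplets (K is set by NRNK and TOL; the POD basis U is
       returned in X(:,1:K) on exit).
    2. K-by-K Rayleigh quotient of the unknown A compressed onto range(U):
           S = U^H * Y * V * SIGMA^(-1)
    3. Eigendecomposition of the Rayleigh quotient (via ZGEEV):
           S*W = W*diag(EIGS)
    4. Ritz pairs: eigenvalues in EIGS(1:K); for JOBZ=='V' the Ritz vectors
       Z(:,1:K) = X(:,1:K)*W(1:K,1:K), optionally with residuals
       RES(i) = || A*Z(:,i) - EIGS(i)*Z(:,i) ||_2 and the refinement /
       Exact-DMD data in B (selected by JOBF).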
+!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim !............................................................ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. 
+!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: ZGESVD (the QR SVD algorithm) -! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). 
+!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. 
On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim !..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! 
RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim !..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by ZGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! 
If the call to ZGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim !..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), -! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: ZGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: ZGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: ZGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim !..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to ZGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. 
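The LZWORK, LRWORK and LIWORK entries here all follow the usual LAPACK workspace-query convention: pass -1 for the lengths, then read the required sizes back from the first elements of the corresponding workspace arrays as described above. A minimal Fortran sketch of that calling pattern, with hypothetical local names ZQRY/RQRY/IQRY and assuming M, N, TOL, the leading dimensions and the array arguments X, Y, Z, RES, B, W, S and EIGS are already declared and filled as documented, could look like:

      USE iso_fortran_env
      INTEGER, PARAMETER :: WP = real64
      INTEGER          :: K, INFO, LZWORK, LRWORK, LIWORK
      COMPLEX(KIND=WP) :: ZQRY(2)
      REAL(KIND=WP)    :: RQRY(2)
      INTEGER          :: IQRY(2)
      COMPLEX(KIND=WP), ALLOCATABLE :: ZWORK(:)
      REAL(KIND=WP),    ALLOCATABLE :: RWORK(:)
      INTEGER,          ALLOCATABLE :: IWORK(:)
      ! Workspace query: LZWORK = LRWORK = LIWORK = -1 only reports sizes.
      CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, -1, TOL, &
                   K, EIGS, Z, LDZ, RES, B, LDB, W, LDW, S, LDS,         &
                   ZQRY, -1, RQRY, -1, IQRY, -1, INFO )
      LZWORK = INT( ZQRY(2) )   ! optimal complex length (minimal is in ZQRY(1))
      LRWORK = INT( RQRY(1) )   ! minimal real workspace length
      LIWORK = IQRY(1)          ! minimal integer workspace length
      ALLOCATE( ZWORK(LZWORK), RWORK(LRWORK), IWORK(LIWORK) )
      ! Actual computation: WHTSVD = 1 (ZGESVD), NRNK = -1 truncation rule.
      CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, -1, TOL, &
                   K, EIGS, Z, LDZ, RES, B, LDB, W, LDW, S, LDS,         &
                   ZWORK, LZWORK, RWORK, LRWORK, IWORK, LIWORK, INFO )

As a rough plausibility check against the formulas in the LZWORK entry above: with WHTSVD == 1, M = 1000 and N = 50, LZWORK_SVD = 2*MIN(M,N)+MAX(M,N) = 1100 and LZWORK_ZGEEV = MAX(1,2*N) = 100, so the query should report a minimal LZWORK of 1100.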
+!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim !..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where -! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: ZGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: ZGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: ZGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. 
See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -401,7 +551,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 @@ -409,13 +559,13 @@ INTEGER IZAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! 
~~~~~~~~~~~~~~~~~~~~ EXTERNAL ZAXPY, ZGEMM, ZDSCAL EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & ZLACPY, ZLASCL, ZLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -608,7 +758,8 @@ K = 0 DO i = 1, N !WORK(i) = DZNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -681,7 +832,8 @@ ! carefully computed using ZLASSQ. DO i = 1, N !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 From 283713e4c541d9e77b4d3922bdc40245a8354a89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:32:45 +0100 Subject: [PATCH 055/191] Add tests for ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/TESTING/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() From c6fa9210278da4c505aec2fab8244ad009b1f944 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:39:53 +0100 Subject: [PATCH 056/191] Add tests for ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e7236677a 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f 
dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) @@ -104,16 +112,20 @@ endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() From 0baf462dbca147a82db9412b521459b60e417d53 Mon Sep 17 00:00:00 2001 From: Wu Xiaotian Date: Wed, 20 Dec 2023 10:34:47 +0800 Subject: [PATCH 057/191] Fix: build failed on LoongArch According to the documentation at https://github.com/loongson/la-abi-specs/blob/release/lapcs.adoc#the-base-abi-variants, valid -mabi parameters are lp64s, lp64f, lp64d, ilp32s, ilp32f and ilp32d. --- cmake/cc.cmake | 14 ++++++++++++-- cmake/fc.cmake | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 00952e810..2da941afb 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (LOONGARCH64) if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + endif () else () - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () endif () set(BINARY_DEFINED 1) endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index c496f6368..5c30be843 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (LOONGARCH64) if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + endif () else () - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () endif () endif () if (RISCV64) From d2f4f1b28aa4c00a4bc4bc1b512f162b349cae4c Mon Sep 17 00:00:00 2001 From: Xiaotian Wu Date: Wed, 20 Dec 2023 14:13:04 +0800 Subject: [PATCH 058/191] CI: update toolchains for LoongArch64 --- .github/workflows/loongarch64.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 
4a9bf98b6..42393eb0c 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -16,13 +16,13 @@ jobs: include: - target: LOONGSONGENERIC triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - target: LOONGSON3R5 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON3R5 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - target: DYNAMIC_ARCH triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC @@ -40,8 +40,8 @@ jobs: - name: Download and install loongarch64-toolchain run: | - wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz - tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - name: Set env run: | From 1106460bb3f5b6d17a77acafaf1fa7ba6051acbd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Dec 2023 12:29:56 +0100 Subject: [PATCH 059/191] remove redundant targets from the default ARM64 DYNAMIC_ARCH list --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index ebdc5a833..eb974456b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () From e7a895e7140f538d14ea5b9354d33840b76e44d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Dec 2023 12:36:05 +0100 Subject: [PATCH 060/191] Add Apple M as NeoverseN1 --- driver/others/dynamic_arm64.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index b5fb8161d..803e0b5eb 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; char coremsg[128]; +#if defined (OS_DARWIN) + return &gotoblas_NEOVERSEN1; +#endif + #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; #else @@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + case 0x61: // Apple + return &gotoblas_NEOVERSEN1; + break; default: snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); From c6996a80e950535306e281f7f2da54f387805d4e Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 8 Dec 2023 16:06:17 +0800 Subject: [PATCH 061/191] loongarch64: Refine amax,amin,max,min optimization. 
--- common_loongarch64.h | 24 +++ kernel/loongarch64/KERNEL.LOONGSON2K1000 | 16 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 16 +- kernel/loongarch64/amax_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amax_lsx.S | 231 ++++++++++++++++++++++ kernel/loongarch64/amin_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amin_lsx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/damax_lasx.S | 183 ------------------ kernel/loongarch64/damax_lsx.S | 145 -------------- kernel/loongarch64/damin_lasx.S | 178 ----------------- kernel/loongarch64/damin_lsx.S | 145 -------------- kernel/loongarch64/dmax_lasx.S | 175 ----------------- kernel/loongarch64/dmax_lsx.S | 141 -------------- kernel/loongarch64/dmin_lasx.S | 175 ----------------- kernel/loongarch64/dmin_lsx.S | 143 -------------- kernel/loongarch64/max_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/max_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/min_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/min_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/samax_lasx.S | 208 -------------------- kernel/loongarch64/samax_lsx.S | 177 ----------------- kernel/loongarch64/samin_lasx.S | 208 -------------------- kernel/loongarch64/samin_lsx.S | 177 ----------------- kernel/loongarch64/smax_lasx.S | 205 -------------------- kernel/loongarch64/smax_lsx.S | 171 ----------------- kernel/loongarch64/smin_lasx.S | 205 -------------------- kernel/loongarch64/smin_lsx.S | 174 ----------------- 27 files changed, 1881 insertions(+), 2826 deletions(-) create mode 100644 kernel/loongarch64/amax_lasx.S create mode 100644 kernel/loongarch64/amax_lsx.S create mode 100644 kernel/loongarch64/amin_lasx.S create mode 100644 kernel/loongarch64/amin_lsx.S delete mode 100644 kernel/loongarch64/damax_lasx.S delete mode 100644 kernel/loongarch64/damax_lsx.S delete mode 100644 kernel/loongarch64/damin_lasx.S delete mode 100644 kernel/loongarch64/damin_lsx.S delete mode 100644 kernel/loongarch64/dmax_lasx.S delete mode 100644 kernel/loongarch64/dmax_lsx.S delete mode 100644 kernel/loongarch64/dmin_lasx.S delete mode 100644 kernel/loongarch64/dmin_lsx.S create mode 100644 kernel/loongarch64/max_lasx.S create mode 100644 kernel/loongarch64/max_lsx.S create mode 100644 kernel/loongarch64/min_lasx.S create mode 100644 kernel/loongarch64/min_lsx.S delete mode 100644 kernel/loongarch64/samax_lasx.S delete mode 100644 kernel/loongarch64/samax_lsx.S delete mode 100644 kernel/loongarch64/samin_lasx.S delete mode 100644 kernel/loongarch64/samin_lsx.S delete mode 100644 kernel/loongarch64/smax_lasx.S delete mode 100644 kernel/loongarch64/smax_lsx.S delete mode 100644 kernel/loongarch64/smin_lasx.S delete mode 100644 kernel/loongarch64/smin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 4963b2f07..72e900f77 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -120,6 +120,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d @@ -128,10 +132,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d #define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d #define VFSUB vfsub.d #define VFADD vfadd.d #define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA 
vfmaxa.d #else @@ -148,6 +160,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.w #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s @@ -156,10 +172,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s #define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s #define VFSUB vfsub.s #define VFADD vfadd.s #define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 1e4fa7a9d..802dd1c9b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lsx.S SSCALKERNEL = sscal_lsx.S DSCALKERNEL = dscal_lsx.S -SAMAXKERNEL = samax_lsx.S -DAMAXKERNEL = damax_lsx.S +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S -SAMINKERNEL = samin_lsx.S -DAMINKERNEL = damin_lsx.S +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S -SMAXKERNEL = smax_lsx.S -DMAXKERNEL = dmax_lsx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lsx.S -DMINKERNEL = dmin_lsx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lsx.S IDMAXKERNEL = idmax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f00abcb32..3253489d9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lasx.S SSCALKERNEL = sscal_lasx.S DSCALKERNEL = dscal_lasx.S -SAMAXKERNEL = samax_lasx.S -DAMAXKERNEL = damax_lasx.S +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S -SAMINKERNEL = samin_lasx.S -DAMINKERNEL = damin_lasx.S +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S -SMAXKERNEL = smax_lasx.S -DMAXKERNEL = dmax_lasx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lasx.S -DMINKERNEL = dmin_lasx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lasx.S IDMAXKERNEL = idmax_lasx.S diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d 
VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge 
$r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d 
VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S deleted file mode 100644 index c44ce4995..000000000 --- a/kernel/loongarch64/damax_lasx.S +++ /dev/null @@ -1,183 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define x5 $xr13 -#define x6 $xr14 -#define x7 $xr15 -#define x8 $xr16 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr18 -#define VM3 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmaxa.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmaxa.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfmaxa.d VM2, x3, x4 - xvfmaxa.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4 Date: Wed, 27 Dec 2023 11:30:17 +0800 Subject: [PATCH 062/191] loongarch64: Refine copy,swap,nrm2,sum optimization. 
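Of the kernels touched here, the nrm2 changes are the least mechanical: dnrm2_lasx.S and dnrm2_lsx.S keep a two-pass, scaled reduction to avoid overflow, and the diff below mainly zero-initialises VM0, renames the second-pass temporaries to VM2/VM3 so the running maximum is not clobbered, and moves the final fsqrt/fmul under the .L999 label. Roughly, pass one finds the largest magnitude, pass two accumulates the squares of the elements scaled by VALPHA (presumably the reciprocal of that maximum), and the result is max * sqrt(sum). A minimal scalar C sketch of that scheme, with an illustrative nrm2_ref name and under the assumption that this is indeed the intent of the assembly:

    /* Two-pass scaled Euclidean norm, as the dnrm2 kernels appear to do it. */
    #include <math.h>

    double nrm2_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;

        double amax = 0.0;                   /* pass 1: largest magnitude */
        for (long i = 0; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v > amax) amax = v;
        }
        if (amax == 0.0) return 0.0;

        double alpha = 1.0 / amax;           /* scale factor, like VALPHA */
        double sum = 0.0;                    /* pass 2: sum of scaled squares */
        for (long i = 0; i < n; i++) {
            double t = x[i * incx] * alpha;
            sum += t * t;                    /* like the xvfmadd/vfmadd chain */
        }
        return amax * sqrt(sum);             /* final fsqrt + fmul under .L999 */
    }

The copy, swap and sum changes in the same patch are straightforward merges: the former scopy/dcopy, sswap/dswap and ssum/dsum sources are folded into single files that branch on the DOUBLE define, in the same way as the amax/amin kernels in the previous patch.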
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 12 +- kernel/loongarch64/copy_lasx.S | 306 ++++++++++++++++ kernel/loongarch64/copy_lsx.S | 316 +++++++++++++++++ kernel/loongarch64/dcopy_lasx.S | 224 ------------ kernel/loongarch64/dcopy_lsx.S | 232 ------------ kernel/loongarch64/dnrm2_lasx.S | 124 ++++--- kernel/loongarch64/dnrm2_lsx.S | 84 +++-- kernel/loongarch64/dsum_lasx.S | 125 ------- kernel/loongarch64/dsum_lsx.S | 123 ------- kernel/loongarch64/dswap_lasx.S | 301 ---------------- kernel/loongarch64/dswap_lsx.S | 317 ----------------- kernel/loongarch64/scopy_lasx.S | 216 ------------ kernel/loongarch64/scopy_lsx.S | 220 ------------ kernel/loongarch64/snrm2_lasx.S | 116 +++--- kernel/loongarch64/snrm2_lsx.S | 141 ++++---- kernel/loongarch64/ssum_lasx.S | 140 -------- kernel/loongarch64/ssum_lsx.S | 125 ------- kernel/loongarch64/sswap_lasx.S | 286 --------------- kernel/loongarch64/sswap_lsx.S | 294 ---------------- kernel/loongarch64/sum_lasx.S | 225 ++++++++++++ kernel/loongarch64/sum_lsx.S | 204 +++++++++++ kernel/loongarch64/swap_lasx.S | 401 +++++++++++++++++++++ kernel/loongarch64/swap_lsx.S | 431 +++++++++++++++++++++++ 24 files changed, 2159 insertions(+), 2816 deletions(-) create mode 100644 kernel/loongarch64/copy_lasx.S create mode 100644 kernel/loongarch64/copy_lsx.S delete mode 100644 kernel/loongarch64/dcopy_lasx.S delete mode 100644 kernel/loongarch64/dcopy_lsx.S delete mode 100644 kernel/loongarch64/dsum_lasx.S delete mode 100644 kernel/loongarch64/dsum_lsx.S delete mode 100644 kernel/loongarch64/dswap_lasx.S delete mode 100644 kernel/loongarch64/dswap_lsx.S delete mode 100644 kernel/loongarch64/scopy_lasx.S delete mode 100644 kernel/loongarch64/scopy_lsx.S delete mode 100644 kernel/loongarch64/ssum_lasx.S delete mode 100644 kernel/loongarch64/ssum_lsx.S delete mode 100644 kernel/loongarch64/sswap_lasx.S delete mode 100644 kernel/loongarch64/sswap_lsx.S create mode 100644 kernel/loongarch64/sum_lasx.S create mode 100644 kernel/loongarch64/sum_lsx.S create mode 100644 kernel/loongarch64/swap_lasx.S create mode 100644 kernel/loongarch64/swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 802dd1c9b..cb230b348 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S SAXPYKERNEL = saxpy_lsx.S DAXPYKERNEL = daxpy_lsx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3253489d9..ba59c4566 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S SAXPYKERNEL = 
saxpy_lasx.S DAXPYKERNEL = daxpy_lasx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, 
.L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S deleted file mode 100644 index 9d7da4a80..000000000 --- a/kernel/loongarch64/dcopy_lasx.S +++ /dev/null @@ -1,224 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - 
xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvst VX0, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S deleted file mode 100644 index 161655bbd..000000000 --- a/kernel/loongarch64/dcopy_lsx.S +++ /dev/null @@ -1,232 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - addi.d I, I, -1 - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE 
- vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 0 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 2 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 4 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S index 2a9c3cf7b..5a6f7cf1e 100644 --- a/kernel/loongarch64/dnrm2_lasx.S +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VM0 $xr17 @@ -35,6 +69,7 @@ xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,12 +81,11 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - xvld VM0, X, 0 bge $r0, I, .L97 .align 3 .L10: - xvld VX0, X, 0 * SIZE + xvld VX0, X, 0 xvld VX1, X, 4 * SIZE xvfmaxa.d VM1, VX1, VX0 xvfmaxa.d VM0, VM0, VM1 @@ -62,40 +96,32 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 .align 3 .L21: - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t4, 3 xvfmaxa.d VM1, VX0, VX1 @@ -109,9 +135,9 @@ xvpickve.d VX0, VM0, 1 xvpickve.d VX1, VM0, 2 xvpickve.d VM3, VM0, 3 - xvfmaxa.d VM1, VX0, VX1 - xvfmaxa.d VM2, VM3, VM0 - xvfmaxa.d VM0, VM1, VM2 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 .align 3 .L97: @@ -149,12 +175,12 @@ .align 3 .L110: - xvld VX0, XX, 0 * SIZE + xvld VX0, XX, 0 xvld VX1, XX, 4 * SIZE - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -166,34 +192,34 @@ bge $r0, I, .L997 .L121: - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 
add.d XX, XX, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 xvinsgr2vr.d VX1, t3, 2 xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -203,10 +229,10 @@ xvfadd.d res1, res1, res2 xvpickve.d VX0, res1, 1 xvpickve.d VX1, res1, 2 - xvpickve.d VM0, res1, 3 - xvfadd.d res1, VX0, res1 - xvfadd.d VX1, VX1, VM0 - xvfadd.d res1, VX1, res1 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 .align 3 .L997: @@ -215,19 +241,17 @@ .align 3 .L998: - fld.d $f15, XX, 0 * SIZE + fld.d $f15, XX, 0 addi.d I, I, -1 fmul.d $f15, $f15, ALPHA fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 - .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S index e4615e18d..fce4260e2 100644 --- a/kernel/loongarch64/dnrm2_lsx.S +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $vr15 #define VX1 $vr16 #define VM0 $vr17 @@ -35,6 +69,7 @@ vxor.v res1, res1, res1 vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,7 +81,7 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - vld VM0, X, 0 + bge $r0, I, .L97 .align 3 @@ -66,15 +101,7 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 .align 3 .L21: @@ -154,16 +181,16 @@ .L110: vld VX0, XX, 0 * SIZE vld VX1, XX, 2 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 vld VX0, XX, 4 * SIZE vld VX1, XX, 6 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -173,6 +200,7 @@ .L120: srai.d I, N, 3 bge $r0, I, .L997 + .align 3 .L121: ld.d t1, XX, 0 * SIZE @@ -187,14 +215,14 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA + vfmul.d VM2, VX0, VALPHA ld.d t1, XX, 0 * SIZE add.d XX, XX, INCX - vfmul.d VM1, VX1, VALPHA + vfmul.d VM3, VX1, VALPHA ld.d t2, XX, 0 * SIZE add.d XX, XX, INCX - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 ld.d t3, XX, 0 * SIZE add.d XX, XX, INCX ld.d t4, XX, 0 * SIZE @@ -203,10 +231,10 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -230,13 +258,11 @@ fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S deleted file mode 100644 index 3c51dab60..000000000 --- a/kernel/loongarch64/dsum_lasx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, 
res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fadd.d $f16, $f12, $f16 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S deleted file mode 100644 index 402d087df..000000000 --- a/kernel/loongarch64/dsum_lsx.S +++ /dev/null @@ -1,123 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d 
I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S deleted file mode 100644 index 221cb7fa2..000000000 --- a/kernel/loongarch64/dswap_lasx.S +++ /dev/null @@ -1,301 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvld VX2, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 3 - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 3 - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - xvstelm.d VX2, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX2, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - 
xvstelm.d VX2, X, 0, 3 - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvst VX0, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX3, X, 0, 0 - add.d X, X, INCY - ld.d t2, X, 0 * SIZE - xvstelm.d VX3, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX3, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX3, X, 0, 3 - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S deleted file mode 100644 index 7f7f585e1..000000000 --- a/kernel/loongarch64/dswap_lsx.S +++ /dev/null @@ -1,317 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne 
INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX2, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - addi.d I, I, -1 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 4 * SIZE - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 2 * SIZE - vld VX2, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - 
blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S deleted file mode 100644 index 7db1e7cee..000000000 --- a/kernel/loongarch64/scopy_lasx.S +++ /dev/null @@ -1,216 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 7 - add.d Y, Y, INCY - addi.d X, X, 8 
* SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S deleted file mode 100644 index 32150d3d6..000000000 --- a/kernel/loongarch64/scopy_lsx.S +++ /dev/null @@ -1,220 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 
- b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vst VX0, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 274908c14..3ae11e897 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -11,10 +43,13 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 +#define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 @@ -37,14 +72,13 @@ .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 0 * SIZE - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -54,70 +88,46 @@ .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX + ld.w t4, X, 0 xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 0 - xvinsgr2vr.w VX1, t2, 1 - xvinsgr2vr.w VX1, t3, 2 - xvinsgr2vr.w VX1, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 4 - xvinsgr2vr.w VX1, t2, 5 - xvinsgr2vr.w VX1, t3, 6 - xvinsgr2vr.w VX1, t4, 7 - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: xvfadd.d res1, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - 
xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 .align 3 .L997: @@ -126,11 +136,11 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE - addi.d I, I, -1 + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 - add.d X, X, INCX + fmadd.d $f19, $f15, $f15, $f19 blt $r0, I, .L998 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index 17d017900..bb492dbf0 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -15,6 +47,9 @@ #define VX1 $vr16 #define VX2 $vr17 #define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. 
*/ #define res1 $vr19 #define res2 $vr20 @@ -24,99 +59,71 @@ LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 vxor.v res2, res2, res2 - bge $r0, N, .L999 + bge $r0, N, .L999 beq $r0, INCX, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 0 * SIZE - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - vld VX2, X, 4 * SIZE - vld VX3, X, 4 * SIZE - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vld VX0, X, 0 + vld VX5, X, 4 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 blt $r0, I, .L10 b .L996 .align 3 - .L20: bge $r0, I, .L997 .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - ld.w t1, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 @@ -124,12 +131,8 @@ .L996: vfadd.d res1, res1, res2 - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 .align 3 .L997: @@ -138,7 +141,7 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE + fld.s $f15, X, 0 addi.d I, I, -1 fcvt.d.s $f15, $f15 fmadd.d $f19, $f15, $f15, $f19 diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S deleted file mode 100644 index 7cf57bc77..000000000 --- a/kernel/loongarch64/ssum_lasx.S +++ /dev/null @@ -1,140 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 
-#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S deleted file mode 100644 index de63c69e3..000000000 --- a/kernel/loongarch64/ssum_lsx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s 
res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S deleted file mode 100644 index 7184eff45..000000000 --- a/kernel/loongarch64/sswap_lasx.S +++ /dev/null @@ -1,286 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 3 - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, 
INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 7 - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 3 - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 4 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 5 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 6 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 7 - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S deleted file 
mode 100644 index 4f19a8024..000000000 --- a/kernel/loongarch64/sswap_lsx.S +++ /dev/null @@ -1,294 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - addi.d I, I, -1 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 3 - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 3 - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - vstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - vstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX2, X, 0, 3 - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vst VX0, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX3, X, 0, 0 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - vstelm.w VX3, X, 0, 1 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - vstelm.w VX3, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX3, X, 0, 3 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w 
VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 + xvfadd.s res1, res1, VX0 +#ifdef DOUBLE + xvld VX1, X, 32 + xvfadd.s res1, res1, VX1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 
+ xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S new file mode 100644 index 000000000..6b2027781 --- /dev/null +++ b/kernel/loongarch64/sum_lsx.S @@ -0,0 +1,204 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S new file mode 100644 index 000000000..736187f93 --- /dev/null +++ b/kernel/loongarch64/swap_lsx.S @@ -0,0 +1,431 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX2, Y, 0 + ld.w t1, X, 0 + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 + vstelm.w VX3, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + ST $f12, Y, 0 * SIZE + ST $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 
+ + EPILOGUE From 154baad454647fdd6d71e2c907285859718da22e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 27 Dec 2023 16:04:33 +0800 Subject: [PATCH 063/191] loongarch64: Refine iamin optimization. --- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{isamin_lasx.S => iamin_lasx.S} | 270 +++++++---- kernel/loongarch64/iamin_lsx.S | 446 ++++++++++++++++++ kernel/loongarch64/idamin_lasx.S | 275 ----------- kernel/loongarch64/idamin_lsx.S | 228 --------- kernel/loongarch64/isamin_lsx.S | 275 ----------- 8 files changed, 649 insertions(+), 863 deletions(-) rename kernel/loongarch64/{isamin_lasx.S => iamin_lasx.S} (54%) create mode 100644 kernel/loongarch64/iamin_lsx.S delete mode 100644 kernel/loongarch64/idamin_lasx.S delete mode 100644 kernel/loongarch64/idamin_lsx.S delete mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 72e900f77..846fc0dbd 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,6 +119,7 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d #define FMIN fmin.d #define FMINA fmina.d @@ -136,6 +137,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.d #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLT xvfcmp.clt.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -144,6 +147,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.d #define VFMAX vfmax.d #define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLT vfcmp.clt.d #else @@ -159,6 +164,7 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s #define FMIN fmin.s #define FMINA fmina.s @@ -176,6 +182,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.s #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLT xvfcmp.clt.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -184,6 +192,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.s #define VFMAX vfmax.s #define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLT vfcmp.clt.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index cb230b348..4eae2e4f9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ba59c4566..e7e1b5d5a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/iamin_lasx.S similarity index 54% rename from kernel/loongarch64/isamin_lasx.S rename to kernel/loongarch64/iamin_lasx.S index cbdf32530..6ea117907 100644 --- a/kernel/loongarch64/isamin_lasx.S 
+++ b/kernel/loongarch64/iamin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,25 +128,49 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else addi.d I, I, -1 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 +#endif + XVCMPEQ VT0, VM0, VM1 addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + 
xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 @@ -105,28 +181,62 @@ xvpickve.w x4, VM0, 3 xvfmina.s VM1, x1, x2 xvfcmp.ceq.s VT0, x1, VM1 +#endif xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX xvinsgr2vr.w VM0, t1, 0 srai.d I, N, 3 @@ -186,9 +296,43 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +357,42 @@ xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 xvfcmp.ceq.s VT0, VM1, VM0 +#endif addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L24 .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmina.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - 
xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else fmov.s $f16, $f20 .align 3 @@ -306,35 +419,28 @@ xvfmina.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -346,9 +452,11 @@ xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 srai.d i1, N, 3 slli.d i1, i1, 3 addi.d i1, i1, 1 //current index @@ -357,17 +465,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmina.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +483,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, 
VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vreplvei.d VI1, VI0, 0 + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + 
addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 +#endif + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + VFMINA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + MTC $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S deleted file mode 100644 index 6ef1e8903..000000000 --- a/kernel/loongarch64/idamin_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define 
VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d 
VT0, VX0, VM1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmina.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S deleted file mode 100644 index 9eb9d883f..000000000 --- a/kernel/loongarch64/idamin_lsx.S +++ /dev/null @@ -1,228 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - 
vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmina.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - vfmina.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S deleted file mode 100644 index 598888660..000000000 --- a/kernel/loongarch64/isamin_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - 
-#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, 
X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmina.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 8be26541933b36c6e3e8002c44002efb02033bdd Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 10:24:24 +0800 Subject: [PATCH 064/191] loongarch64: Refine imax optimization. 
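This refactor folds the single- and double-precision kernels into one imax source selected by #ifdef DOUBLE, with the XVCMPLT/VCMPLT/CMPLT and MTG helper macros (presumably provided by the shared LoongArch64 headers) replacing the hard-coded .s/.d instruction forms. For reference, below is an illustrative scalar sketch — not code from this patch — of the i?max semantics the vector code has to reproduce: the 1-based index of the first occurrence of the largest value (plain value, not magnitude), with ties resolved to the lowest index as the original comments ("处理尾数相等时取最小序号", i.e. "on equal values keep the smallest index") require, and 0 returned when n <= 0 or incx <= 0. The ref_imax name and the simplified double/long types are placeholders for the FLOAT/BLASLONG definitions from common.h.

    /* Illustrative scalar reference, not part of this patch: the semantics
     * the vectorized imax kernels are expected to match. */
    static long ref_imax(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0;                  /* the kernels also return 0 here */

        long   best_i = 1;             /* result is 1-based */
        double best_v = x[0];

        for (long i = 1; i < n; i++) {
            double v = x[i * incx];
            if (v > best_v) {          /* strictly greater: first (lowest-index) max wins */
                best_v = v;
                best_i = i + 1;
            }
        }
        return best_i;
    }

The imin refactor in the following patch mirrors this with the comparisons reversed.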
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmax_lasx.S | 273 ----------- kernel/loongarch64/idmax_lsx.S | 225 --------- .../loongarch64/{ismax_lasx.S => imax_lasx.S} | 230 ++++++++-- kernel/loongarch64/imax_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismax_lsx.S | 272 ----------- 7 files changed, 626 insertions(+), 810 deletions(-) delete mode 100644 kernel/loongarch64/idmax_lasx.S delete mode 100644 kernel/loongarch64/idmax_lsx.S rename kernel/loongarch64/{ismax_lasx.S => imax_lasx.S} (57%) create mode 100644 kernel/loongarch64/imax_lsx.S delete mode 100644 kernel/loongarch64/ismax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 4eae2e4f9..346f1fb45 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lsx.S -IDMAXKERNEL = idmax_lsx.S +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S ISMINKERNEL = ismin_lsx.S IDMINKERNEL = idmin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e7e1b5d5a..6b4df2d61 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lasx.S -IDMAXKERNEL = idmax_lasx.S +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S ISMINKERNEL = ismin_lasx.S IDMINKERNEL = idmin_lasx.S diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S deleted file mode 100644 index bbfe0941a..000000000 --- a/kernel/loongarch64/idmax_lasx.S +++ /dev/null @@ -1,273 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - 
xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - 
xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S deleted file mode 100644 index 1b4734bab..000000000 --- a/kernel/loongarch64/idmax_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * 
SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x1, x2 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/imax_lasx.S similarity index 57% rename from kernel/loongarch64/ismax_lasx.S rename to kernel/loongarch64/imax_lasx.S index 843dd6c6a..2d3d5e9d3 100644 --- a/kernel/loongarch64/ismax_lasx.S +++ b/kernel/loongarch64/imax_lasx.S @@ -1,3 +1,29 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ #define ASSEMBLER #include "common.h" @@ -39,6 +65,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +127,47 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 xvfcmp.clt.s VT0, VM0, VX0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +181,33 @@ xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 xvfcmp.clt.s VT0, x1, x2 +#endif xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 - .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -143,11 +226,38 @@ ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX ld.w t2, TEMP, 0 * SIZE +#endif add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE + ld.d t3, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE + ld.d t4, TEMP, 0 * SIZE add.d TEMP, TEMP, 
INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else xvinsgr2vr.w VM0, t1, 4 xvinsgr2vr.w VM0, t2, 5 xvinsgr2vr.w VM0, t3, 6 @@ -186,9 +296,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -218,10 +365,21 @@ addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -230,57 +388,56 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x1, x2 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + 
fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -343,6 +500,7 @@ fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 movfr2gr.s i0, $f20 +#endif .L21: //N<8 andi I, N, 7 @@ -357,14 +515,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 + CMPLT $fcc0, $f15, $f9 add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -372,4 +530,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S deleted file mode 100644 index 33b326bbd..000000000 --- a/kernel/loongarch64/ismax_lsx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, 
VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, 
I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 116aee7527935a51b288a15b7feffc6ea2313e8a Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 15:17:28 +0800 Subject: [PATCH 065/191] loongarch64: Refine imin optimization. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmin_lasx.S | 272 ----------- kernel/loongarch64/idmin_lsx.S | 225 --------- .../loongarch64/{ismin_lasx.S => imin_lasx.S} | 266 ++++++++--- kernel/loongarch64/imin_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismin_lsx.S | 271 ----------- 7 files changed, 645 insertions(+), 825 deletions(-) delete mode 100644 kernel/loongarch64/idmin_lasx.S delete mode 100644 kernel/loongarch64/idmin_lsx.S rename kernel/loongarch64/{ismin_lasx.S => imin_lasx.S} (54%) create mode 100644 kernel/loongarch64/imin_lsx.S delete mode 100644 kernel/loongarch64/ismin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 346f1fb45..9164f28ef 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lsx.S IDMAXKERNEL = imax_lsx.S -ISMINKERNEL = ismin_lsx.S -IDMINKERNEL = idmin_lsx.S +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 6b4df2d61..5882b8932 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lasx.S IDMAXKERNEL = imax_lasx.S -ISMINKERNEL = ismin_lasx.S -IDMINKERNEL = idmin_lasx.S +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S deleted file mode 100644 index 7930d4963..000000000 --- a/kernel/loongarch64/idmin_lasx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, 
i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, 
VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S deleted file mode 100644 index 8b6edcbf0..000000000 --- a/kernel/loongarch64/idmin_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - addi.d X, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v 
VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x2, x1 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/imin_lasx.S similarity index 54% rename from kernel/loongarch64/ismin_lasx.S rename to kernel/loongarch64/imin_lasx.S index 15f6e2ec9..5306828e2 100644 --- a/kernel/loongarch64/ismin_lasx.S +++ b/kernel/loongarch64/imin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +128,45 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -98,32 +175,67 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 xvbitsel.v VM0, VM0, 
VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -185,9 +297,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +362,83 @@ xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 
xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + MTG i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -294,13 +453,13 @@ xvpickve.w x2, VM0, 5 xvpickve.w x3, VM0, 6 xvpickve.w x4, VM0, 7 - xvfcmp.clt.s VT0, x2, x1 + XVCMPLT VT0, x2, x1 xvbitsel.v x1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, x1, VM0 + XVCMPLT VT0, x1, VM0 xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 li.d TEMP, 1 //处理尾数相等时取最小序号 @@ -309,7 +468,7 @@ xvfcmp.ceq.s VT0, VM0, x1 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 @@ -317,7 +476,7 @@ xvfcmp.ceq.s VT0, VM0, x2 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 @@ -325,7 +484,7 @@ xvfcmp.ceq.s VT0, VM0, x3 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 @@ -333,17 +492,18 @@ xvfcmp.ceq.s VT0, VM0, x4 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L292: - fcmp.clt.s $fcc0, $f13, $f15 + CMPLT $fcc0, $f13, $f15 fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 - movfr2gr.s i0, $f20 + MTG i0, $f20 +#endif -.L21: //N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -356,14 +516,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f9, $f15 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 - add.d X, X, INCX blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -371,4 +531,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + 
VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT 
VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S deleted file mode 100644 index f90ebbd57..000000000 --- a/kernel/loongarch64/ismin_lsx.S +++ /dev/null @@ -1,271 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define 
VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - 
blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, x2, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, x3, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, x4, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - fcmp.clt.s $fcc0, $f9, $f15 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From ea70e165c71201e46961c479e53c17d4034290f8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 20:07:59 +0800 Subject: [PATCH 066/191] loongarch64: Refine rot optimization. 
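This change folds the separate srot_/drot_ LSX and LASX kernels into precision-generic sources (rot_lsx.S, rot_lasx.S) and extends common_loongarch64.h with DOUBLE-gated helper macros (FFINT, XVMUL, XVMSUB, VMUL, VMSUB, alongside the existing CMPEQ/CMPLT family), so a single assembly file expands to either the .s or .d instruction forms. For orientation only, a minimal C sketch of the same dispatch idea and of the plane rotation these kernels apply is given below; the rot_reference helper and the demo main are illustrative assumptions, not code from this patch.

  /* Illustrative sketch only -- not part of this patch.
   * It mirrors the DOUBLE-gated macro selection used in common_loongarch64.h
   * and the arithmetic carried out by rot_lsx.S / rot_lasx.S:
   *   x[i] = c*x[i] + s*y[i]
   *   y[i] = c*y[i] - s*x[i]
   * Build with -DDOUBLE for the double-precision variant.
   */
  #include <stdio.h>

  #ifdef DOUBLE
  typedef double FLOAT;   /* assembly side picks the *.d instruction forms */
  #else
  typedef float  FLOAT;   /* assembly side picks the *.s instruction forms */
  #endif

  /* Hypothetical scalar reference for the vectorized rot kernels. */
  static void rot_reference(int n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
  {
      for (int i = 0; i < n; i++) {
          FLOAT xi = x[i], yi = y[i];
          x[i] = c * xi + s * yi;   /* XVMUL + XVFMADD path in the kernel */
          y[i] = c * yi - s * xi;   /* XVMUL + XVMSUB  path in the kernel */
      }
  }

  int main(void)
  {
      FLOAT x[4] = {1, 2, 3, 4}, y[4] = {5, 6, 7, 8};
      rot_reference(4, x, y, (FLOAT)0.6, (FLOAT)0.8);
      for (int i = 0; i < 4; i++)
          printf("%f %f\n", (double)x[i], (double)y[i]);
      return 0;
  }
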
--- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/drot_lsx.S | 1050 ---------- .../loongarch64/{drot_lasx.S => rot_lasx.S} | 779 ++++++- kernel/loongarch64/rot_lsx.S | 1791 +++++++++++++++++ kernel/loongarch64/srot_lasx.S | 863 -------- kernel/loongarch64/srot_lsx.S | 927 --------- 8 files changed, 2532 insertions(+), 2896 deletions(-) delete mode 100644 kernel/loongarch64/drot_lsx.S rename kernel/loongarch64/{drot_lasx.S => rot_lasx.S} (52%) create mode 100644 kernel/loongarch64/rot_lsx.S delete mode 100644 kernel/loongarch64/srot_lasx.S delete mode 100644 kernel/loongarch64/srot_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 846fc0dbd..13514d6e0 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -129,6 +129,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l #define XVFSUB xvfsub.d #define XVFADD xvfadd.d @@ -139,6 +140,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d #define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -149,6 +152,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d #define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d #else @@ -174,6 +179,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l #define XVFSUB xvfsub.s #define XVFADD xvfadd.s @@ -184,6 +190,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s #define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -194,6 +202,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s #define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 9164f28ef..b315c81f2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S -SROTKERNEL = srot_lsx.S -DROTKERNEL = drot_lsx.S +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 5882b8932..577f6316e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S -SROTKERNEL = srot_lasx.S -DROTKERNEL = drot_lasx.S +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S deleted file mode 100644 index 6db803b1c..000000000 --- a/kernel/loongarch64/drot_lsx.S +++ /dev/null @@ -1,1050 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 
-#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.d.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, C - vreplgr2vr.d VXC, t1 - movfr2gr.d t2, S - vreplgr2vr.d VXS, t2 - movfr2gr.d t3, a1 - vreplgr2vr.d VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 2 * SIZE - vst VXZ, Y, 2 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - vst VXZ, X, 6 * SIZE - vst VXZ, Y, 6 * SIZE - addi.d X, X, 8 * 
SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, 
t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS 
- vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 
- blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - 
vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 
0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/rot_lasx.S similarity index 52% rename from kernel/loongarch64/drot_lasx.S rename to kernel/loongarch64/rot_lasx.S index d3644b780..5d7e3d7cc 100644 --- a/kernel/loongarch64/drot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -33,16 +60,25 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 movfr2gr.d t2, S xvreplgr2vr.d VXS, t2 movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -53,15 +89,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L112 // C!=0 S==0 b .L111 // C!=0 S!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L114 // C==0 S==0 b .L113 // C==0 S!=0 .align 3 @@ -69,20 +105,24 @@ .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -93,16 +133,20 @@ .L112: // C!=0 S==0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,18 +157,22 @@ .L113: // C==0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS xvfsub.d VT1, VXZ, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -135,8 +183,10 @@ .L114: // C==0 S==0 xvst VXZ, X, 0 * SIZE xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, X, 4 * SIZE xvst VXZ, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -148,37 +198,66 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L122 // C!=0 S==0 b .L121 // C!=0 S!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L124 // C==0 S==0 b .L123 // C==0 S!=0 .align 3 .L121: // C!=0 S!=0 xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * 
SIZE + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvst VT0, X, 0 * SIZE xvstelm.d VT1, YY, 0, 0 @@ -201,11 +280,10 @@ xvinsgr2vr.d VX3, t3, 2 xvinsgr2vr.d VX3, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 - addi.d I, I, -1 + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -214,13 +292,34 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // C!=0 S==0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -269,13 +368,60 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // C==0 S!=0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -326,14 +472,63 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 
0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // C==0 S==0 xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE xvst VXZ, X, 4 * SIZE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY @@ -350,29 +545,50 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 + addi.d X, X, 8 * SIZE blt $r0, I, .L124 + move Y, YY b .L997 .align 3 .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L212 // C!=0 S==0 b .L211 // C!=0 S!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L214 // C==0 S==0 b .L213 // C==0 S!=0 .align 3 .L211: // C!=0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -425,6 +641,54 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ 
-432,6 +696,7 @@ .align 3 .L212: // C!=0 S==0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -480,6 +745,52 @@ add.d XX, XX, INCX xvfmul.d VT1, VX3, VXS xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L212 @@ -487,6 +798,7 @@ .align 3 .L213: // C==0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -537,6 +849,53 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L213 @@ -544,6 +903,7 @@ .align 3 .L214: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -562,6 +922,25 @@ xvstelm.d VXZ, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ -572,20 +951,21 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L222 // C!=0 S==0 b .L221 // C!=0 S!=0 .align 3 .L220: - 
fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L224 // C==0 S==0 b .L223 // C==0 S!=0 .align 3 .L221: // C!=0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -674,12 +1054,99 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L221 b .L997 .align 3 .L222: // C!=0 S==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -764,12 +1231,97 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, 
t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L222 b .L997 .align 3 .L223: // C==0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -856,12 +1408,98 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L223 b .L997 .align 3 .L224: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -893,9 +1531,46 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + 
add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif b .L997 .align 3 @@ -905,15 +1580,15 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 @@ -924,4 +1599,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..4b0e59310 --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1791 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 
* SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + 
vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + 
vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 +#ifdef DOUBLE + move Y, YY +#endif + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 
+#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d 
VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * 
SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 +#ifdef DOUBLE + move X, XX +#endif + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB 
VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#ifndef DOUBLE + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC 
+ vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, 
XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d 
XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S deleted file mode 100644 index 9aeb4dcf5..000000000 --- a/kernel/loongarch64/srot_lasx.S +++ /dev/null @@ -1,863 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VT0 $xr10 -#define VT1 $xr18 -#define VXC $xr23 -#define VXS $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - xvreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - xvreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 
- b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * 
SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 
0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXS, VX2 - xvfmul.s VT1, VXS, VX0 - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - 
xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - 
add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S deleted file mode 100644 index 8822b58e4..000000000 --- a/kernel/loongarch64/srot_lsx.S +++ /dev/null @@ -1,927 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - vreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - vreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, 
TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, 
I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w 
VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VXS, VX0 - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXS - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXS, VX2 - vfmul.s VT1, VXS, VX0 - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - 
add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 
- add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - 
vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 173a65d4e6ddf8bc5a9b1cd02d594d4b48dc9f89 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 25 Dec 2023 15:11:04 +0800 Subject: [PATCH 067/191] loongarch64: Add and refine iamax optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 7 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 7 +- kernel/loongarch64/camax_lasx.S | 194 ++++++ kernel/loongarch64/camax_lsx.S | 206 +++++++ .../{isamax_lasx.S => iamax_lasx.S} | 254 ++++++-- kernel/loongarch64/iamax_lsx.S | 482 +++++++++++++++ kernel/loongarch64/icamax_lasx.S | 562 ++++++++++++++++++ kernel/loongarch64/icamax_lsx.S | 434 ++++++++++++++ kernel/loongarch64/idamax_lasx.S | 275 --------- kernel/loongarch64/idamax_lsx.S | 267 --------- kernel/loongarch64/isamax_lsx.S | 275 --------- 12 files changed, 2101 insertions(+), 866 deletions(-) create mode 100644 kernel/loongarch64/camax_lasx.S create mode 100644 kernel/loongarch64/camax_lsx.S rename kernel/loongarch64/{isamax_lasx.S => iamax_lasx.S} (55%) create mode 100644 kernel/loongarch64/iamax_lsx.S create mode 100644 kernel/loongarch64/icamax_lasx.S create mode 100644 kernel/loongarch64/icamax_lsx.S delete mode 100644 kernel/loongarch64/idamax_lasx.S delete mode 100644 kernel/loongarch64/idamax_lsx.S delete mode 100644 kernel/loongarch64/isamax_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 13514d6e0..599b4795c 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -139,6 +139,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d @@ -151,6 +152,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.d #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d @@ -189,6 +191,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s @@ -201,6 +204,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.s #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 
index b315c81f2..a8a6dd82f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lsx.S ISMINKERNEL = imin_lsx.S IDMINKERNEL = imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 577f6316e..e4c45e1fa 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lasx.S ISMINKERNEL = imin_lasx.S IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, 
t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/iamax_lasx.S similarity index 55% rename from kernel/loongarch64/isamax_lasx.S rename to kernel/loongarch64/iamax_lasx.S index 2800b1d43..090da3004 100644 --- a/kernel/loongarch64/isamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,9 +128,25 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else xvld VX0, X, 0 * SIZE addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 @@ -87,10 +155,21 @@ addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +182,62 @@ xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 
ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -186,9 +299,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -219,10 +369,30 @@ addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -245,44 +415,45 @@ xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 +#endif + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifdef DOUBLE + +#else .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -306,35 +477,27 @@ xvfmaxa.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 - .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, 
$f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -345,8 +508,9 @@ xvfcmp.ceq.s VT0, VM0, VX0 xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 +#endif -.L21: // N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -357,17 +521,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmaxa.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +539,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, 
VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +#ifdef DOUBLE +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +#else +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 
1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + VFMAXA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S new file mode 100644 index 000000000..7800cb917 --- /dev/null +++ b/kernel/loongarch64/icamax_lasx.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, 
VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * 
SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v 
VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + 
vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s 
VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S deleted file mode 100644 index 8248ee757..000000000 --- a/kernel/loongarch64/idamax_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 
//3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - 
bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmaxa.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S deleted file mode 100644 index fb2d5bac1..000000000 --- a/kernel/loongarch64/idamax_lsx.S +++ /dev/null @@ -1,267 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L11 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L16 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L17 - .align 3 - -.L16: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L17: - movfr2gr.d i0, $f20 - .align 
3 - -.L11: //INCX==1 and N<8 - andi I, N, 7 - bge $r0, I, .L14 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L13: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - addi.d X, X, SIZE - movgr2fr.d $f21, i1 - blt $r0, I, .L13 - movfr2gr.d i0, $f20 - .align 3 - -.L14: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S deleted file mode 100644 index a18aa7354..000000000 --- a/kernel/loongarch64/isamax_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 
$r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - 
vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmaxa.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 179ed51d3b2df5e0df8a28d184bd169efa7f2b61 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 21 Dec 2023 14:18:39 +0800 Subject: [PATCH 068/191] Add dgemm_kernel_8x4.S file. 
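The new kernel computes an 8x4 double-precision micro-tile with 128-bit LSX vectors: sixteen $vr accumulators (D0-D15) hold the 8x4 block, the KERNEL2x*x4 macros interleave two k iterations so the vld/vldrepl.d loads for the next step overlap the vfmadd.d chain of the current one, and preld hints prefetch the upcoming A and B panels. A is packed in 8-row panels and B in 4-column panels by the generic copy routines wired up in KERNEL.LOONGSON2K1000. As a rough reference only (the function name and the plain-C loops below are illustrative, not part of this patch), the scalar equivalent of one micro-tile update is:

    /* Illustrative scalar sketch of the 8x4 micro-tile this kernel
     * vectorizes; assumes the usual OpenBLAS packed-panel layout
     * (A in K blocks of 8, B in K blocks of 4, C column-major, ldc = column stride). */
    static void dgemm_micro_8x4_ref(long K, double alpha,
                                    const double *A, const double *B,
                                    double *C, long ldc)
    {
        double acc[8][4] = {{0.0}};
        for (long k = 0; k < K; k++)            /* the unrolled KERNEL8x8x4 k loop */
            for (int i = 0; i < 8; i++)
                for (int j = 0; j < 4; j++)
                    acc[i][j] += A[8 * k + i] * B[4 * k + j];
        for (int j = 0; j < 4; j++)             /* write-back, scaled by alpha */
            for (int i = 0; i < 8; i++)
                C[j * ldc + i] += alpha * acc[i][j];
    }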
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 14 + kernel/loongarch64/dgemm_kernel_8x4.S | 2894 ++++++++++++++++++++++ param.h | 4 +- 3 files changed, 2910 insertions(+), 2 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_8x4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a8a6dd82f..a78c0dbc5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,4 +58,18 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, 
D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro 
KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + 
vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + 
addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + 
KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 
+ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT 
+ /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if 
defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/param.h b/param.h index ee4640f57..e4e242d5d 100644 --- a/param.h +++ b/param.h @@ -2888,8 +2888,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 From e771be185ee3ae604ce0b6ffb0dc38258b04f866 Mon Sep 17 00:00:00 2001 From: guxiwei Date: Thu, 21 Dec 2023 14:28:06 +0800 Subject: [PATCH 069/191] Optimize copy functions with lsx. 
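The LSX copy routines replace the generic C packing code selected in the previous commit. They gather four (or eight) source columns at a time and interleave them per row with GINTERLACE/vilvl-style shuffles, so dgemm_kernel_8x4.S can stream the packed panels with unit stride. As a rough guide only (not part of the patch), a plain-C sketch of the layout the 4-column variant produces from a column-major source with leading dimension lda; the function name is illustrative:

    typedef double FLOAT;
    typedef long   BLASLONG;

    static void dgemm_ncopy_4_ref(BLASLONG m, BLASLONG n,
                                  const FLOAT *src, BLASLONG lda, FLOAT *dst)
    {
        const FLOAT *s = src;
        FLOAT *d = dst;
        BLASLONG j;

        for (j = 0; j + 4 <= n; j += 4) {            /* .L_J1: full 4-column panels */
            const FLOAT *s1 = s, *s2 = s1 + lda, *s3 = s2 + lda, *s4 = s3 + lda;
            s += 4 * lda;
            for (BLASLONG i = 0; i < m; i++) {       /* emit one row at a time      */
                *d++ = s1[i]; *d++ = s2[i];          /* GINTERLACE in the assembly  */
                *d++ = s3[i]; *d++ = s4[i];
            }
        }
        if (n & 2) {                                 /* .L_N2: 2-column remainder   */
            const FLOAT *s1 = s, *s2 = s1 + lda;
            s += 2 * lda;
            for (BLASLONG i = 0; i < m; i++) {
                *d++ = s1[i]; *d++ = s2[i];
            }
        }
        if (n & 1)                                   /* .L_N1: last column          */
            for (BLASLONG i = 0; i < m; i++)
                *d++ = s[i];
    }

The 8-column and tcopy variants in this patch follow the same packing idea for the other panel width and access order.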
Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 8 +- kernel/loongarch64/dgemm_ncopy_4_lsx.S | 185 +++++++ kernel/loongarch64/dgemm_ncopy_8_lsx.S | 283 +++++++++++ kernel/loongarch64/dgemm_tcopy_4_lsx.S | 280 +++++++++++ kernel/loongarch64/dgemm_tcopy_8_lsx.S | 597 +++++++++++++++++++++++ 5 files changed, 1349 insertions(+), 4 deletions(-) create mode 100644 kernel/loongarch64/dgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a78c0dbc5..00cb769eb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq 
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE From 06fd5b5995e66e7b54bb4b8496a3b946cd56212e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 10:44:02 +0800 Subject: [PATCH 070/191] loongarch64: Add and Refine asum optimization functions. 
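The new asum_lsx.S/asum_lasx.S fold the former sasum/dasum sources into a single file selected by #ifdef DOUBLE, and casum_lsx.S/casum_lasx.S add the complex variants. The kernels take the absolute value branch-free: multiply by a splatted -1.0, compare each lane against zero, and select the non-negative value before accumulating. A scalar sketch of the real-valued reduction (hypothetical name, plain C, illustrative only) under those conventions:

    #include <stddef.h>

    /* Hypothetical scalar model of the vectorized asum loop:
     * neg mirrors the vfmul-by-(-1) splat, and the conditional mirrors
     * the vfcmp.clt mask followed by vbitsel; the vector code reduces
     * its partial sums afterwards (the .L12/.L22 blocks). */
    static double dasum_sketch(size_t n, const double *x, size_t incx)
    {
        double sum = 0.0;
        for (size_t i = 0; i < n; i++) {
            double v   = x[i * incx];
            double neg = -1.0 * v;           /* vfmul with the -1 vector */
            sum += (v < 0.0) ? neg : v;      /* vfcmp.clt + vbitsel      */
        }
        return sum;
    }

The complex variants accumulate |re| + |im| per element, which is why the casum kernels scale the stride by ZBASE_SHIFT and consume eight complex elements (sixteen values) per iteration.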
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/asum_lasx.S | 257 ++++++++++++++++ kernel/loongarch64/asum_lsx.S | 258 ++++++++++++++++ kernel/loongarch64/casum_lasx.S | 329 +++++++++++++++++++++ kernel/loongarch64/casum_lsx.S | 358 +++++++++++++++++++++++ kernel/loongarch64/dasum_lasx.S | 148 ---------- kernel/loongarch64/dasum_lsx.S | 158 ---------- kernel/loongarch64/sasum_lasx.S | 157 ---------- kernel/loongarch64/sasum_lsx.S | 148 ---------- 10 files changed, 1210 insertions(+), 615 deletions(-) create mode 100644 kernel/loongarch64/asum_lasx.S create mode 100644 kernel/loongarch64/asum_lsx.S create mode 100644 kernel/loongarch64/casum_lasx.S create mode 100644 kernel/loongarch64/casum_lsx.S delete mode 100644 kernel/loongarch64/dasum_lasx.S delete mode 100644 kernel/loongarch64/dasum_lsx.S delete mode 100644 kernel/loongarch64/sasum_lasx.S delete mode 100644 kernel/loongarch64/sasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 00cb769eb..201427dcd 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S -SASUMKERNEL = sasum_lsx.S -DASUMKERNEL = dasum_lsx.S +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e4c45e1fa..e822cb630 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S -SASUMKERNEL = sasum_lasx.S -DASUMKERNEL = dasum_lasx.S +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, 
X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S deleted file mode 100644 index 49de98c40..000000000 --- a/kernel/loongarch64/dasum_lasx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - xvreplgr2vr.d neg1, t1 - xvffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - 
xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S deleted file mode 100644 index 94750815e..000000000 --- a/kernel/loongarch64/dasum_lsx.S +++ /dev/null @@ -1,158 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - vreplgr2vr.d neg1, t1 - vffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, 
res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S deleted file mode 100644 index a452701aa..000000000 --- a/kernel/loongarch64/sasum_lasx.S +++ /dev/null @@ -1,157 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - xvreplgr2vr.w neg1, t1 - xvffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - 
EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S deleted file mode 100644 index 87026a144..000000000 --- a/kernel/loongarch64/sasum_lsx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - vreplgr2vr.w neg1, t1 - vffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 0753848e03e1298c162386df467f78bc15851cc4 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 16:54:01 +0800 Subject: [PATCH 071/191] loongarch64: Refine and add axpy optimization functions. 
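This patch merges the separate saxpy/daxpy LSX and LASX sources into single axpy_lsx.S / axpy_lasx.S files that pick the element type with #ifdef DOUBLE and the type-generic LD/ST/MADD/FFINT/MTG macros, and it adds caxpy_lsx.S / caxpy_lasx.S kernels that the KERNEL files reuse for both CAXPY and ZAXPY. As a reference for what the vector loops compute: the real kernels perform y[i] += alpha * x[i] (with a plain vector-add fast path when alpha == 1), and the complex kernels perform the update sketched below. The sketch is a minimal scalar illustration only; the helper name, parameter names, and the assumption of positive strides are mine for clarity and are not the kernels' actual entry points or calling convention.

    /* Scalar reference for the complex axpy update vectorized by
     * caxpy_lsx.S / caxpy_lasx.S:
     *   y[i] += alpha * x[i]            (default)
     *   y[i] += alpha * conj(x[i])      (when CONJ is defined)
     * x and y hold interleaved (re, im) pairs; inc_x / inc_y are
     * strides in complex elements and are assumed positive here. */
    static void caxpy_ref(long n, float alpha_r, float alpha_i,
                          const float *x, long inc_x,
                          float *y, long inc_y, int conj)
    {
        for (long i = 0; i < n; i++) {
            float xr = x[2 * i * inc_x];
            float xi = x[2 * i * inc_x + 1];
            if (!conj) {
                /* y += alpha * x */
                y[2 * i * inc_y]     += alpha_r * xr - alpha_i * xi;
                y[2 * i * inc_y + 1] += alpha_r * xi + alpha_i * xr;
            } else {
                /* y += alpha * conj(x) */
                y[2 * i * inc_y]     += alpha_r * xr + alpha_i * xi;
                y[2 * i * inc_y + 1] += alpha_i * xr - alpha_r * xi;
            }
        }
    }

The assembly keeps the same structure for every stride combination: unit-stride paths use full vector loads and stores, the non-unit-stride paths gather through INCX/INCY with ld.w/ld.d plus vinsgr2vr/xvinsgr2vr and scatter with vstelm/xvstelm, and a short scalar tail loop finishes the elements left over after the vectorized blocks.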
Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- .../loongarch64/{daxpy_lasx.S => axpy_lasx.S} | 237 +++++- .../loongarch64/{daxpy_lsx.S => axpy_lsx.S} | 254 ++++++- kernel/loongarch64/caxpy_lasx.S | 707 ++++++++++++++++++ kernel/loongarch64/caxpy_lsx.S | 679 +++++++++++++++++ kernel/loongarch64/saxpy_lasx.S | 323 -------- kernel/loongarch64/saxpy_lsx.S | 338 --------- 8 files changed, 1839 insertions(+), 711 deletions(-) rename kernel/loongarch64/{daxpy_lasx.S => axpy_lasx.S} (52%) rename kernel/loongarch64/{daxpy_lsx.S => axpy_lsx.S} (53%) create mode 100644 kernel/loongarch64/caxpy_lasx.S create mode 100644 kernel/loongarch64/caxpy_lsx.S delete mode 100644 kernel/loongarch64/saxpy_lasx.S delete mode 100644 kernel/loongarch64/saxpy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 201427dcd..bdde126ad 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S -SAXPYKERNEL = saxpy_lsx.S -DAXPYKERNEL = daxpy_lsx.S +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e822cb630..7642b2a4d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S -SAXPYKERNEL = saxpy_lasx.S -DAXPYKERNEL = daxpy_lasx.S +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S similarity index 52% rename from kernel/loongarch64/daxpy_lasx.S rename to kernel/loongarch64/axpy_lasx.S index bafd871ab..707fd09b5 100644 --- a/kernel/loongarch64/daxpy_lasx.S +++ b/kernel/loongarch64/axpy_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -70,6 +102,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -77,6 +116,7 @@ .align 3 .L112: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -86,6 +126,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L112 @@ -97,11 +144,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -114,6 +161,7 @@ .align 3 .L121: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -158,6 +206,50 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, 
INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -169,11 +261,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -185,6 +277,7 @@ .align 3 .L211: +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -217,6 +310,37 @@ addi.d I, I, -1 xvst VX3, Y, 4 * SIZE addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif blt $r0, I, .L211 .align 3 @@ -226,11 +350,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -243,6 +367,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -309,6 +434,73 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -319,15 +511,14 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, 
$f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 - b .L999 .align 3 .L999: diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S similarity index 53% rename from kernel/loongarch64/daxpy_lsx.S rename to kernel/loongarch64/axpy_lsx.S index fc88f0bb9..0d74e2bce 100644 --- a/kernel/loongarch64/daxpy_lsx.S +++ b/kernel/loongarch64/axpy_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -75,16 +107,27 @@ vld VX3, Y, 6 * SIZE vfadd.d VX2, VX0, VX2 vfadd.d VX3, VX1, VX3 - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L111 b .L113 .align 3 .L112: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -104,6 +147,19 @@ vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif blt $r0, I, .L112 .align 3 @@ -113,11 +169,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -130,6 +186,7 @@ .align 3 .L121: +#ifdef DOUBLE vld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -180,6 +237,54 @@ add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L121 .align 3 @@ -189,11 +294,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - 
fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -205,6 +310,7 @@ .align 3 .L211: +#ifdef DOUBLE vld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -242,6 +348,39 @@ vfmadd.d VX3, VX1, VXA, VX3 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 .align 3 @@ -252,11 +391,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -269,6 +408,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -337,6 +477,74 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -347,11 +555,11 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 diff --git a/kernel/loongarch64/caxpy_lasx.S 
b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, 
VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 
+#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, 
x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, 
VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d 
VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 
+ +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w 
t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S 
b/kernel/loongarch64/saxpy_lasx.S deleted file mode 100644 index 609e26328..000000000 --- a/kernel/loongarch64/saxpy_lasx.S +++ /dev/null @@ -1,323 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfadd.s VX2, VX0, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfmadd.s VX2, VX0, VXA, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE 
- add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S deleted file mode 100644 index f47415ed6..000000000 --- a/kernel/loongarch64/saxpy_lsx.S +++ /dev/null @@ -1,338 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA 
$vr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfadd.s VX2, VX0, VX2 - vfadd.s VX3, VX1, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX0, VXA, VX2 - vfmadd.s VX3, VX1, VXA, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L113 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - vld VX1, X, 4 * SIZE - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmadd.s VX2, VX0, VXA, VX2 - vld VX3, Y, 4 * SIZE - vst VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - 
vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 8785e948b534a7747611b36b46a69f5813a18cc2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 17:04:46 +0800 Subject: [PATCH 072/191] loongarch64: Add camin optimization function. 
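
CAMIN returns the smallest complex 1-norm in the vector, i.e. the minimum over i of |Re(x_i)| + |Im(x_i)|; that is what the fabs/fadd/fmin (and vectorized vfmin/xvfmin) sequences in the new kernels below compute. For reference while reviewing, a minimal scalar C sketch of the operation — the function name and signature here are illustrative only, not the OpenBLAS kernel interface:

    #include <math.h>

    /* Illustrative reference: smallest |Re| + |Im| over n complex floats
       stored interleaved in x, stepping inc_x complex elements at a time.
       Returns 0 for n <= 0 or inc_x <= 0, matching the kernels' early exits. */
    static float camin_ref(long n, const float *x, long inc_x)
    {
        if (n <= 0 || inc_x <= 0) return 0.0f;
        float m = fabsf(x[0]) + fabsf(x[1]);        /* first element seeds the minimum */
        for (long i = 1; i < n; i++) {
            const float *p = x + 2 * i * inc_x;     /* two floats per complex element */
            float v = fabsf(p[0]) + fabsf(p[1]);
            if (v < m) m = v;
        }
        return m;
    }
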
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 1 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/camin_lasx.S | 199 +++++++++++++++++++++ kernel/loongarch64/camin_lsx.S | 211 +++++++++++++++++++++++ 4 files changed, 412 insertions(+) create mode 100644 kernel/loongarch64/camin_lasx.S create mode 100644 kernel/loongarch64/camin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index bdde126ad..fc1766ff5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7642b2a4d..7de9d4440 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s 
t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 2a34fb4b80d494d76da7a9ea6a3d54dffbd57f37 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 18:17:51 +0800 Subject: [PATCH 073/191] loongarch64: Add and refine scal optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/cscal_lasx.S | 645 +++++++++++++++++++++++ kernel/loongarch64/cscal_lsx.S | 571 ++++++++++++++++++++ kernel/loongarch64/scal_lasx.S | 282 ++++++++++ kernel/loongarch64/scal_lsx.S | 301 +++++++++++ kernel/loongarch64/sscal_lasx.S | 188 ------- kernel/loongarch64/sscal_lsx.S | 194 ------- 8 files changed, 1807 insertions(+), 386 deletions(-) create mode 100644 kernel/loongarch64/cscal_lasx.S create mode 100644 kernel/loongarch64/cscal_lsx.S create mode 100644 kernel/loongarch64/scal_lasx.S create mode 100644 kernel/loongarch64/scal_lsx.S delete mode 100644 kernel/loongarch64/sscal_lasx.S delete mode 100644 kernel/loongarch64/sscal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index fc1766ff5..7abdae55a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S -SSCALKERNEL = sscal_lsx.S -DSCALKERNEL = dscal_lsx.S +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7de9d4440..13f9f23ed 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S -SSCALKERNEL = sscal_lasx.S -DSCALKERNEL = dscal_lasx.S +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ 
b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, 
VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + 
xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 
+ xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d 
I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 
2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w 
x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
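For reference, the complex scaling computed above (both the vectorized vfmsub/vfmadd pairs and the scalar tail at .L998) is the standard x := alpha * x update for alpha = alpha_r + i*alpha_i. A minimal C sketch of the per-element arithmetic, using illustrative names that do not appear in the kernel:

/* One element of complex SCAL: (xr + i*xi) *= (ar + i*ai).
   Shown for double precision; the single-precision path is analogous.
   Sketch only, not the kernel's actual code. */
static void cscal_elem(double *xr, double *xi, double ar, double ai)
{
    double re = ar * (*xr) - ai * (*xi);   /* MSUB / vfmsub pattern */
    double im = ar * (*xi) + ai * (*xr);   /* MADD / vfmadd pattern */
    *xr = re;
    *xi = im;
}

The .L221/.L222/.L223/.L224 branches above are this same formula specialized for the cases where alpha_r, alpha_i, or both are zero.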
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + 
blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
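Both new real SCAL kernels in this patch (scal_lasx.S above, scal_lsx.S below) share the same control flow: return immediately when alpha == 1, store zeros when alpha == 0, and otherwise multiply eight elements per vector iteration with a scalar tail for the remainder, choosing between a strided and a contiguous path based on INCX. A hedged C sketch of that structure, with illustrative names:

/* Structural sketch of the scal kernels; single precision shown,
   the DOUBLE build is analogous. Not the actual implementation. */
static void scal_sketch(long n, float alpha, float *x, long incx)
{
    if (n <= 0 || incx <= 0) return;     /* early exits at the top      */
    if (alpha == 1.0f) return;           /* .L999: nothing to do        */
    if (alpha == 0.0f) {                 /* .L20 path: just store zeros */
        for (long i = 0; i < n; i++) x[i * incx] = 0.0f;
        return;
    }
    for (long i = 0; i < n; i++)         /* .L10 (strided) / .L30       */
        x[i * incx] *= alpha;            /* (contiguous), 8 per vector  */
}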
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * 
SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S deleted file mode 100644 index 329f24659..000000000 --- a/kernel/loongarch64/sscal_lasx.S +++ /dev/null @@ -1,188 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $xr12 -#define VX1 $xr13 -#define VT0 $xr14 -#define VT1 $xr15 -#define VALPHA $xr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA, TEMP - move XX, X - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VT0, VX0, VALPHA - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w 
VT0, XX, 0, 7 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvst VX0, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvfmul.s VT0, VX0, VALPHA - xvst VT0, X, 0 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S deleted file mode 100644 index d0ea1307d..000000000 --- a/kernel/loongarch64/sscal_lsx.S +++ /dev/null @@ -1,194 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $vr12 -#define VX1 $vr13 -#define VT0 $vr14 -#define VT1 $vr15 -#define VALPHA $vr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA, TEMP - move XX, X - .align 3 - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - vfmul.s VT0, VX0, VALPHA - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vfmul.s VT1, VX1, VALPHA - vstelm.w VT1, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 1 - add.d XX, XX, 
INCX - vstelm.w VT1, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 3 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - vxor.v VX0, VX0, VX0 - vst VX0, X, 0 * SIZE - vst VX0, X, 4 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VT0, VX0, VALPHA - vfmul.s VT1, VX1, VALPHA - addi.d I, I, -1 - vst VT0, X, 0 * SIZE - vst VT1, X, 4 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE From 65a0aeb128a46b75b189d3a21b619d005351b2d2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 17:45:17 +0800 Subject: [PATCH 074/191] loongarch64: Add c/zcopy optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/ccopy_lasx.S | 386 +++++++++++++++++++++ kernel/loongarch64/ccopy_lsx.S | 411 +++++++++++++++++++++++ 4 files changed, 801 insertions(+) create mode 100644 kernel/loongarch64/ccopy_lasx.S create mode 100644 kernel/loongarch64/ccopy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 7abdae55a..2aa68af67 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 13f9f23ed..3bcec2c62 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * 
SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
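Both copy kernels added by this patch (ccopy_lasx.S above, ccopy_lsx.S below) implement the same operation: copy N complex elements from X to Y, with four code paths chosen by whether INCX and INCY are unit strides, eight complex elements per vector iteration, and a scalar tail for N % 8. A minimal C sketch of the semantics, with illustrative names:

/* CCOPY/ZCOPY semantics: y <- x for n complex elements.
   Strides are in complex units, matching ZBASE_SHIFT in the kernel.
   Negative increments are omitted here for brevity. */
static void ccopy_sketch(long n, const float *x, long incx,
                         float *y, long incy)
{
    for (long i = 0; i < n; i++) {
        y[2 * i * incy]     = x[2 * i * incx];      /* real part      */
        y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary part */
    }
}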
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From d97272cb35af5849515b6a5850ddfe642e29430f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 19:09:18 +0800 Subject: [PATCH 075/191] loongarch64: Add c/zdot optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cdot_lasx.S | 565 +++++++++++++++++++++++ kernel/loongarch64/cdot_lsx.S | 397 ++++++++++++++++ 4 files changed, 966 insertions(+) create mode 100644 kernel/loongarch64/cdot_lasx.S create mode 100644 kernel/loongarch64/cdot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 2aa68af67..c70120f9a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -3,6 +3,8 @@ ifndef NO_LSX SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S SSCALKERNEL = scal_lsx.S DSCALKERNEL = scal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3bcec2c62..98673ae09 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -3,6 +3,8 @@ ifndef NO_LASX SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S SSCALKERNEL = scal_lasx.S DSCALKERNEL = scal_lasx.S diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, 
Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + 
ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + 
xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 + xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From fbd612f8c4f3eceb16a645c8a5366af35e7b6a2e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:07:58 +0800 Subject: [PATCH 076/191] loongarch64: Add ic/zamin optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/icamin_lasx.S | 555 +++++++++++++++++++++++ kernel/loongarch64/icamin_lsx.S | 425 +++++++++++++++++ 5 files changed, 988 insertions(+) create mode 100644 kernel/loongarch64/icamin_lasx.S create mode 100644 kernel/loongarch64/icamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 599b4795c..e581e2e3e 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -133,6 +133,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d +#define XVFMUL xvfmul.d #define XVFMADD xvfmadd.d #define XVFMIN xvfmin.d #define XVFMINA xvfmina.d @@ -146,6 +147,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.d #define VFADD vfadd.d +#define VFMUL vfmul.d #define VFMADD vfmadd.d #define VFMIN vfmin.d #define VFMINA vfmina.d @@ -185,6 +187,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s +#define XVFMUL xvfmul.s #define XVFMADD xvfmadd.s #define XVFMIN xvfmin.s #define XVFMINA xvfmina.s @@ -198,6 +201,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.s #define VFADD vfadd.s +#define VFMUL vfmul.s #define VFMADD vfmadd.s #define VFMIN vfmin.s #define VFMINA vfmina.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c70120f9a..5e2632574 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 98673ae09..20a4d9a7e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, 
i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 
+ xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + 
XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + 
.align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + 
fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 3c53ded315901759f0ee2a77d07121c8905fb18d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:26:01 +0800 Subject: [PATCH 077/191] loongarch64: Add c/znrm2 optimization functions. 
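The LASX/LSX kernels added below compute the single- and double-precision complex 2-norm. In rough terms: the double-precision (ZNRM2) path first finds the largest |real| or |imag| component of the vector, then accumulates the squares of the components scaled by that maximum so the partial sums cannot overflow, and finally multiplies the square root of the scaled sum by the maximum again; the single-precision (CNRM2) path instead widens each component to double and accumulates the squares directly. For reference only (this scalar sketch and its function name are illustrative and not part of the patch), the scaled double-precision computation looks roughly like:

    #include <math.h>

    /* Illustrative scalar reference for ZNRM2: two passes over a complex
     * vector of n elements with stride incx (counted in complex elements,
     * assumed > 0 here).  Pass 1 finds the largest |real| or |imag|
     * component; pass 2 accumulates the squares of the components divided
     * by that maximum, so the sum of squares cannot overflow.  The result
     * is max * sqrt(scaled sum of squares). */
    static double znrm2_ref(int n, const double *x, int incx)
    {
        double amax = 0.0, ssq = 0.0;

        for (int i = 0; i < n; i++) {
            double re = fabs(x[2 * i * incx]);
            double im = fabs(x[2 * i * incx + 1]);
            if (re > amax) amax = re;
            if (im > amax) amax = im;
        }
        if (amax == 0.0) return 0.0;

        for (int i = 0; i < n; i++) {
            double re = x[2 * i * incx] / amax;
            double im = x[2 * i * incx + 1] / amax;
            ssq += re * re + im * im;
        }
        return amax * sqrt(ssq);
    }

The assembly below avoids the per-element division by precomputing ALPHA = 1/max and multiplying, with a guard that falls through to the epilogue when that reciprocal overflows to infinity.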
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cnrm2_lasx.S | 147 +++++++++++++ kernel/loongarch64/cnrm2_lsx.S | 155 ++++++++++++++ kernel/loongarch64/znrm2_lasx.S | 252 ++++++++++++++++++++++ kernel/loongarch64/znrm2_lsx.S | 260 +++++++++++++++++++++++ 6 files changed, 818 insertions(+) create mode 100644 kernel/loongarch64/cnrm2_lasx.S create mode 100644 kernel/loongarch64/cnrm2_lsx.S create mode 100644 kernel/loongarch64/znrm2_lasx.S create mode 100644 kernel/loongarch64/znrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5e2632574..826588318 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 20a4d9a7e..b61ecd427 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, res + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S new file mode 100644 index 000000000..20950ba17 --- /dev/null +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, 
ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 
+ vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE From 1ec5dded43b9d5e14875061f62d5edec480f9584 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 21:23:59 +0800 Subject: [PATCH 078/191] loongarch64: Add c/zrot optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/crot_lasx.S | 1079 ++++++++++++++++++++++ kernel/loongarch64/crot_lsx.S | 907 ++++++++++++++++++ 4 files changed, 1990 insertions(+) create mode 100644 kernel/loongarch64/crot_lasx.S create mode 100644 kernel/loongarch64/crot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 826588318..02ea4304e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index b61ecd427..462698f85 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S new file mode 100644 index 000000000..d4ec1e22c --- /dev/null +++ b/kernel/loongarch64/crot_lasx.S @@ -0,0 +1,1079 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + 
xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: 
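// non-unit-stride path: complex elements are gathered lane by lane with
// xvinsgr2vr (real parts into x1/x3, imaginary parts into x2/x4), the
// rotation x = C*x + S*y, y = C*y - S*x is applied to the real and
// imaginary components separately (C and S are real), and the results
// are scattered back element-wise with xvstelm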
// C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w 
x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 
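// (single-precision .L222) with S == 0 the update reduces to a pure
// scaling, x[i] = C*x[i] and y[i] = C*y[i], computed by the xvfmul.s
// block that follows the gather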
+ ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + 
ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, 
INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, 
VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + 
ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + 
vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d 
x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + 
vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From edabb936681bff6fea0454aedf064c19c1db217f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 29 Dec 2023 15:08:10 +0800 Subject: [PATCH 079/191] loongarch64: Refine axpby optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{daxpby_lasx.S => axpby_lasx.S} | 475 ++++++++++++- .../loongarch64/{daxpby_lsx.S => axpby_lsx.S} | 503 +++++++++++++- kernel/loongarch64/saxpby_lasx.S | 597 ----------------- kernel/loongarch64/saxpby_lsx.S | 629 ------------------ 6 files changed, 931 insertions(+), 1281 deletions(-) rename kernel/loongarch64/{daxpby_lasx.S => axpby_lasx.S} (55%) rename kernel/loongarch64/{daxpby_lsx.S => axpby_lsx.S} (56%) delete mode 100644 kernel/loongarch64/saxpby_lasx.S delete mode 100644 kernel/loongarch64/saxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 02ea4304e..0fb0bb68f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lsx.S CAXPYKERNEL = caxpy_lsx.S ZAXPYKERNEL = caxpy_lsx.S -SAXPBYKERNEL = saxpby_lsx.S -DAXPBYKERNEL = daxpby_lsx.S +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 462698f85..1a6a04532 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lasx.S CAXPYKERNEL = caxpy_lasx.S ZAXPYKERNEL = caxpy_lasx.S -SAXPBYKERNEL = saxpby_lasx.S -DAXPBYKERNEL = daxpby_lasx.S +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S similarity index 55% rename from kernel/loongarch64/daxpby_lasx.S rename to kernel/loongarch64/axpby_lasx.S index 4b19703e7..f1d99cd3b 100644 --- a/kernel/loongarch64/daxpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA xvreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,21 +85,22 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 .L111: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE @@ -77,6 +111,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -85,34 +126,46 @@ .L112: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA xvst VX0, Y, 0 * SIZE xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L112 b .L997 .align 3 .L113: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE +#else + xvfmul.s 
VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE blt $r0, I, .L113 b .L997 .align 3 .L114: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -122,21 +175,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -182,14 +236,59 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA @@ -208,14 +307,33 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -250,7 +368,6 @@ xvstelm.d VX2, YY, 0, 3 add.d YY, YY, INCY xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 1 @@ -258,12 +375,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, 
t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -279,6 +440,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -287,21 +465,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -334,12 +513,43 @@ xvfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -369,6 +579,35 @@ xvfmul.d VX1, VX1, VXA addi.d I, I, -1 xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -376,20 +615,27 @@ .L213: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvfmul.s 
VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -399,20 +645,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -481,12 +728,81 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -529,12 +845,56 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, 
INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -577,12 +937,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -598,6 +1002,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -610,12 +1031,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S similarity index 56% rename from kernel/loongarch64/daxpby_lsx.S rename to kernel/loongarch64/axpby_lsx.S index 9aafbaf2a..45154c262 100644 --- a/kernel/loongarch64/daxpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE vreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA vreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,15 +85,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 @@ -68,6 +101,7 @@ .L111: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vld VX3, Y, 2 * SIZE vfmul.d VX0, VX0, VXA @@ -86,6 +120,16 @@ vfmadd.d VX3, VX3, VXB, VX1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -95,6 +139,7 @@ .L112: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -106,6 +151,13 @@ vfmul.d VX3, VX3, VXA vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,7 +165,8 @@ b .L997 .align 3 -.L113: // ALPHA==0 BETA!=0\ +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE vld VX0, Y, 0 * SIZE vld VX1, Y, 2 * SIZE vfmul.d VX0, VX0, VXB @@ -126,6 +179,14 @@ vfmul.d VX3, VX3, VXB vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L113 @@ -134,9 +195,13 @@ .L114: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -146,21 +211,22 @@ 
.L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -212,6 +278,53 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -220,6 +333,7 @@ .L122: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -242,6 +356,26 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 @@ -250,6 +384,7 @@ .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -294,12 +429,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, 
YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -315,6 +495,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -323,21 +520,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -378,12 +576,47 @@ vfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -417,6 +650,37 @@ vfmul.d VX1, VX1, VXA addi.d I, I, -1 vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -424,6 +688,7 @@ .L213: // ALPHA==0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX3, Y, 2 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB @@ -433,19 +698,30 @@ vld VX3, Y, 6 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // 
ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -455,20 +731,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -541,12 +818,83 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -591,12 +939,57 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 
+#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -641,12 +1034,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -662,6 +1100,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -674,12 +1129,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S deleted file mode 100644 index c5d1ff402..000000000 --- a/kernel/loongarch64/saxpby_lasx.S +++ /dev/null @@ -1,597 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 -#define VXB $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - xvreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - 
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvfmadd.s VX2, VX2, VXB, VX0 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 - b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - 
add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w 
t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - 
xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S deleted file mode 100644 index 7f8cea2dd..000000000 --- a/kernel/loongarch64/saxpby_lsx.S +++ /dev/null @@ -1,629 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA $vr23 -#define VXB $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - vreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vfmadd.s VX2, VX2, VXB, VX0 - vfmadd.s VX3, VX3, VXB, VX1 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 
- b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - vld VX1, X, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, 
.L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX2, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX0, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vfmadd.s VX2, VX2, VXB, VX0 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w 
VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vfmadd.s VX3, VX3, VXB, VX1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 546f13558caad680fdd82778c106ebaa3bd7331c Mon 
Sep 17 00:00:00 2001 From: gxw Date: Fri, 29 Dec 2023 11:03:53 +0800 Subject: [PATCH 080/191] loongarch64: Add {c/z}swap and {c/z}sum optimization --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/csum_lasx.S | 274 +++++++++++++++ kernel/loongarch64/csum_lsx.S | 266 ++++++++++++++ kernel/loongarch64/cswap_lasx.S | 394 +++++++++++++++++++++ kernel/loongarch64/cswap_lsx.S | 421 +++++++++++++++++++++++ 6 files changed, 1367 insertions(+) create mode 100644 kernel/loongarch64/csum_lasx.S create mode 100644 kernel/loongarch64/csum_lsx.S create mode 100644 kernel/loongarch64/cswap_lasx.S create mode 100644 kernel/loongarch64/cswap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 0fb0bb68f..01f8e4782 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lsx.S CNRM2KERNEL = cnrm2_lsx.S ZNRM2KERNEL = znrm2_lsx.S +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 1a6a04532..a9e8abaf0 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lasx.S CNRM2KERNEL = cnrm2_lasx.S ZNRM2KERNEL = znrm2_lasx.S +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + 
xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
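For reference: cswap/zswap exchanges two complex vectors element by element; the four code paths in the kernel below differ only in how unit and non-unit increments of X and Y are addressed. A minimal scalar C sketch under the assumption of positive strides; the name and signature are illustrative, not taken from the OpenBLAS sources:

#include <stddef.h>

/* Scalar reference for the vectorized cswap/zswap kernels: swap n complex
   elements of x and y. Shown for float (cswap); zswap would use double.
   incx/incy count complex elements. */
static void cswap_ref(size_t n, float *x, size_t incx, float *y, size_t incy)
{
    for (size_t i = 0; i < n; i++) {
        float re = x[0], im = x[1];     /* save x(i) */
        x[0] = y[0]; x[1] = y[1];       /* copy y(i) into x(i) */
        y[0] = re;   y[1] = im;         /* copy the saved x(i) into y(i) */
        x += 2 * incx;
        y += 2 * incy;
    }
}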
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From a5d0d21378f6da778b18c8f9c5fd4ad33d9b52d5 Mon Sep 17 00:00:00 2001 From: pengxu Date: Fri, 29 Dec 2023 15:10:01 +0800 Subject: [PATCH 081/191] loongarch64: Add zgemm and cgemm optimization --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 24 +- kernel/loongarch64/cgemm_kernel_2x2_lasx.S | 857 +++++++++++++++++++++ kernel/loongarch64/cgemm_kernel_2x2_lsx.S | 812 +++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_2_lasx.S | 193 +++++ kernel/loongarch64/cgemm_ncopy_2_lsx.S | 202 +++++ kernel/loongarch64/cgemm_tcopy_2_lasx.S | 218 ++++++ kernel/loongarch64/cgemm_tcopy_2_lsx.S | 218 ++++++ kernel/loongarch64/zgemm_kernel_2x2.S | 848 ++++++++++++++++++++ kernel/loongarch64/zgemm_kernel_2x2_lasx.S | 822 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_2_lasx.S | 196 +++++ kernel/loongarch64/zgemm_tcopy_2_lasx.S | 212 +++++ param.h | 8 +- 14 files changed, 4621 insertions(+), 5 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_2_lasx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index e581e2e3e..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -144,6 +144,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -158,6 +159,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d #else @@ -198,6 +200,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -212,6 +215,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 01f8e4782..c365e9a75 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -95,4 +95,16 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a9e8abaf0..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -107,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define 
MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + 
srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
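For context on the packing routines in this patch: the 2x2 complex GEMM kernels consume A and B through small copy kernels; ncopy_2 interleaves two matrix columns row by row into a contiguous buffer, while tcopy_2 performs the corresponding transposed packing, walking two rows at a time. A scalar C sketch of the column (ncopy) packing, assuming column-major storage with lda counted in complex elements; the name and signature are illustrative, not taken from the OpenBLAS sources:

#include <stddef.h>

/* Scalar reference for the 2-wide complex ncopy packing: columns j and j+1
   of A are interleaved row by row, and a trailing odd column is appended
   as-is. Shown for double (zgemm); the cgemm variant would use float. */
static void zgemm_ncopy_2_ref(size_t m, size_t n, const double *a,
                              size_t lda, double *b)
{
    size_t j;
    for (j = 0; j + 1 < n; j += 2) {
        const double *a0 = a + 2 * (j + 0) * lda;   /* column j   */
        const double *a1 = a + 2 * (j + 1) * lda;   /* column j+1 */
        for (size_t i = 0; i < m; i++) {
            *b++ = a0[2 * i]; *b++ = a0[2 * i + 1];
            *b++ = a1[2 * i]; *b++ = a1[2 * i + 1];
        }
    }
    if (n & 1) {                                    /* leftover column */
        const double *a0 = a + 2 * (n - 1) * lda;
        for (size_t i = 0; i < 2 * m; i++)
            *b++ = a0[i];
    }
}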
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S new file mode 100644 index 000000000..589d170c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_2x2.S @@ -0,0 +1,848 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
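The zgemm kernel below, like the cgemm kernels earlier in this patch, folds all conjugation cases into one body by redefining MADD1..MADD4 per case: MADD1 is always a plain fused multiply-add, while MADD2..MADD4 toggle between MADD and NMSUB. Applied to the partial products re*re, re*im, im*im and im*re, following the convention of the other OpenBLAS complex kernels, the NN/NT/TN/TT selection reduces to the ordinary complex multiply-accumulate, sketched in scalar C below (the helper name is an illustrative assumption, not an OpenBLAS function):

/* What MADD1..MADD4 compute per element pair in the non-conjugated case:
   c_re += a_re*b_re - a_im*b_im,  c_im += a_re*b_im + a_im*b_re.
   The conjugated variants only flip the signs applied by MADD2..MADD4. */
static inline void zmadd_nn(double a_re, double a_im,
                            double b_re, double b_im,
                            double *c_re, double *c_im)
{
    *c_re += a_re * b_re;   /* MADD1 */
    *c_im += a_re * b_im;   /* MADD2 */
    *c_re -= a_im * b_im;   /* MADD3 expands to NMSUB in this case */
    *c_im += a_im * b_re;   /* MADD4 */
}

Keeping the sign choice in preprocessor macros lets one kernel source serve every transpose/conjugate combination without runtime branches.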
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 +#define c51 $f26 +#define c52 $f27 +#define c61 $f28 +#define c62 $f29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -88 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + ST $f23, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, 
TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index e4e242d5d..54760a82e 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 From 0f648ebcd15d4a577eae9584e95b804acc25cc3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 20:31:32 +0100 Subject: [PATCH 082/191] use alternate download for the CLFS cross-compiler package --- .github/workflows/loongarch64.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 42393eb0c..f1bf8064c 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -40,7 +40,8 @@ jobs: - name: Download and install loongarch64-toolchain run: | - wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - name: Set env From 44b5b9e39fa9bc98eb880d3f781ee4c337459ec6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 22:50:40 +0100 Subject: [PATCH 083/191] Update C/ZGEMM MN for Loongson2k1000 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index 03cea76c9..03d6fa11d 100644 --- a/param.h +++ b/param.h @@ -2891,11 +2891,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 From 2802478449b7ea9bc7d1517c055635e8037908dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 23:35:51 +0100 Subject: [PATCH 084/191] revert change to Loongson2k1000 zgemm --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 03d6fa11d..f3731c491 100644 --- a/param.h +++ b/param.h @@ -2894,8 +2894,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 From d3451af03f97a5087cbfcdc1ca83cdf07616d7b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:35:37 +0100 Subject: [PATCH 085/191] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/cbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), From bd787c8a1a7039fe4c0e6df5c1d9ff70738342dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:36:47 +0100 Subject: [PATCH 086/191] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/dbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), From 1b668479deb9e1ef229c8f40b96d6f56b60402c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:37:52 +0100 Subject: [PATCH 087/191] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/sbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), From 00d7476b4beba9cbd0cceac89e483e14664b9037 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:39:21 +0100 Subject: [PATCH 088/191] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/zbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. 
MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), From 4a15d7242037862053a6332844f3838cd1ec64e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 16:30:57 +0100 Subject: [PATCH 089/191] AzureCI: Update alpine-chroot-install (#4403) * Update alpine-chroot-install --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 317bc504a..3ae8615a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -288,9 +288,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ - && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \ + && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 504f9b0c5e4d7a9f6d87639d18dfb193e5b0b980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 18:46:21 +0100 Subject: [PATCH 090/191] Increase S/D GEMM PQ to match typical L2 size as forNeoverseV1 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index f3731c491..469c38ce3 100644 --- a/param.h +++ b/param.h @@ -3359,13 +3359,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 From 03713bc464bf20a57ad777a41820c608fb5d0e8c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:08:49 +0100 Subject: [PATCH 091/191] Update Changelog for 0.3.26 --- Changelog.txt | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index e0fe0ca5a..b6139d6b7 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,49 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.26 + 2-Jan-2024 + +general: +- improved the version of openblas.pc that is created by the CMAKE build +- fixed a CMAKE-specific build problem on older versions of MacOS +- worked around linking problems on old versions of MacOS +- corrected installation location of the lapacke_mangling header in CMAKE builds +- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header +- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading +- imported additions and corrections from the Reference-LAPACK project: + - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941) + - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942) + - fixed use of uninitialized variables in 
?GEDMD and improved inline documentation (PR 959) + - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967) + - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736) + - fixed several memory leaks in the LAPACK testsuite (PR 953) + - fixed counting of testsuite results by the Python script (PR 954) + +x86-64: +- fixed computation of CASUM on SkylakeX and newer targets in the special + case that AVX512 is not supported by the compiler or operating environment +- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets +- worked around a problem in the pre-AVX kernels for GEMV +- sped up the thread management code on MS Windows + +arm64: +- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer +- sped up the thread management code on MS Windows +- sped up SGEMM and DGEMM on Neoverse V1 and N1 +- sped up ?DOT on SVE-capable targets +- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones +- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds + +power: +- improved the SGEMM kernel for POWER10 +- fixed compilation with (very) old versions of gcc +- fixed detection of old 32bit PPC targets in CMAKE-based builds +- added autodetection of the POWERPC 7400 subtype +- fixed CMAKE-based compilation for PPCG4 and PPC970 targets + +loongarch64: +- added and improved optimized kernels for almost all BLAS functions + ==================================================================== Version 0.3.25 12-Nov-2023 From fde8bb990372980b840758bcef714bf565cdfb68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:24:33 +0100 Subject: [PATCH 092/191] Update version to 0.3.26 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3eac2edf..6d09f5658 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 25.dev) +set(OpenBLAS_PATCH_VERSION 26) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 6c77e5e314474773a7749357b153caba4ec3817d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:25:05 +0100 Subject: [PATCH 093/191] Update Makefile.rule --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 58f02358e..daf2d958d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.25.dev +VERSION = 0.3.26 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 4f5da84e2f447b466db2a3d468dbee8881cc23a0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:32:27 +0100 Subject: [PATCH 094/191] Update version to 0.3.26.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d09f5658..5a1e4b271 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 26) +set(OpenBLAS_PATCH_VERSION 26.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 1412d2deeb32cfc1d80150eba520a5bba915f1c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:33:01 +0100 Subject: [PATCH 095/191] Update version to 0.3.26.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index daf2d958d..8dbf5eab6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.26 +VERSION = 0.3.26.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From e48627c999e3aadfbcdb2b1f1589d4e75840dfca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jan 2024 23:55:52 +0100 Subject: [PATCH 096/191] Add tests for ZSCAL with NaN and Inf arguments --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_zscal.c | 49 +++++++++++++++++++++++++++++++++++++++++++ utest/utest_main2.c | 50 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 utest/test_zscal.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 2e32827d3..c47954ce4 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -15,6 +15,7 @@ else () test_dsdot.c test_dnrm2.c test_swap.c + test_zscal.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index f99035440..d0715c754 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_zscal.c b/utest/test_zscal.c new file mode 100644 index 000000000..b6310439f --- /dev/null +++ b/utest/test_zscal.c @@ -0,0 +1,49 @@ +#include "openblas_utest.h" +#include +#ifdef BUILD_COMPLEX16 + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 
0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +#endif diff --git a/utest/utest_main2.c b/utest/utest_main2.c index 4382bf159..8cb663190 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -617,6 +617,51 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT(isnan(nan[0]); + ASSERT(isnan(nan[1]); + ASSERT(isnan(nan[16]); + ASSERT(isnan(nan[17]); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT(isnan(i[0]); + ASSERT(isnan(i[1]); + ASSERT(isnan(i[16]); + ASSERT(isnan(i[17]); + } + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT(isnan(inf[0]); + ASSERT(isinf(inf[1]); + ASSERT(isnan(inf[16]); + ASSERT(isinf(inf[17]); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT(isnan(i[0]); + ASSERT(isinf(i[1]); + ASSERT(isnan(i[16]); + ASSERT(isinf(i[17]); +} + int main(int argc, const char ** argv){ CTEST_ADD (amax, samax); @@ -648,7 +693,10 @@ int main(int argc, const char ** argv){ CTEST_ADD (swap,zswap_inc_0); CTEST_ADD (swap,sswap_inc_0); CTEST_ADD (swap,cswap_inc_0); - + CTEST_ADD (zscal, i_nan); + CTEST_ADD (zscal, nan_i); + CTEST_ADD (zscal, i_inf); + CTEST_ADD (zscal, inf_i); int num_fail=0; num_fail=ctest_main(argc, argv); From def4996170815ced665af2d1d8bbc08490feb809 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 15:29:42 +0100 Subject: [PATCH 097/191] Fix handling of NAN and INF arguments --- kernel/x86_64/zscal.c | 61 +++++++++++++++++++------------------- kernel/x86_64/zscal_sse2.S | 1 + 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 45e3531b8..b8d2a8754 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #include "common.h" - +#include #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_skylakex-2.c" @@ -222,12 +222,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_r == 0.0 ) { - BLASLONG n1 = n & -2; if ( da_i == 0.0 ) { - while(j < n1) { @@ -253,7 +251,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n1) { @@ -356,49 +353,59 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[0] = da_r; alpha[1] = da_i; - + if ( da_r == 0.0 ) if ( da_i == 0 ) zscal_kernel_8_zero(n1 , alpha , x); else - zscal_kernel_8_zero_r(n1 , alpha , x); +// zscal_kernel_8_zero_r(n1 , alpha , x); + zscal_kernel_8(n1 , alpha , x); else - if ( da_i == 0 ) + if ( da_i == 0 && da_r == da_r) zscal_kernel_8_zero_i(n1 , alpha , x); else zscal_kernel_8(n1 , alpha , x); - + } i = n1 << 1; j = n1; - } - - - if ( da_r == 0.0 ) + + if ( da_r == 0.0 || da_r != da_r ) { - if ( da_i == 0.0 ) { - + FLOAT res=0.0; + if (da_r != da_r) res= da_r; while(j < n) { - - x[i]=0.0; - x[i+1]=0.0; + x[i]=res; + x[i+1]=res; i += 2 ; j++; } } - else + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { + while(j < n) + { + x[i]=SNAN; + x[i+1]=da_r; + i += 2 ; + j++; + + } + + } else { while(j < n) { - temp0 = -da_i * x[i+1]; + if (x[i] < -FLT_MAX || x[i] > FLT_MAX) + temp0 = SNAN; x[i+1] = da_i * x[i]; - x[i] = temp0; + if ( x[i] == x[i]) //preserve NaN + x[i] = temp0; i += 2 ; j++; @@ -409,12 +416,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - - if ( da_i == 0.0 ) + if (da_i == 0.0) { - - while(j < n) - { + while(j < n) + { temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; @@ -422,15 +427,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, i += 2 ; j++; - } - + } } else { while(j < n) { - temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; @@ -445,5 +448,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return(0); } - - diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 223b1e439..d6a49136d 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -82,6 +82,7 @@ pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 + jp .L100 comisd %xmm1, %xmm15 jne .L100 From c9df62e88375fae17adf555d73a757e2e8f65cab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 17:49:40 +0100 Subject: [PATCH 098/191] Fix handling of NAN --- kernel/arm64/zscal.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 929455975..4bd43320d 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -223,7 +223,7 @@ zscal_begin: fcmp DA_I, #0.0 beq .Lzscal_kernel_RI_zero - b .Lzscal_kernel_R_zero +// b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: From d3d99c34f24a426bd90dfd97f76683dcd0eb2006 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 17:56:51 +0100 Subject: [PATCH 099/191] Fix handling of NAN and INF --- kernel/mips/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..d8cbffcf9 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,6 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } + if (!isnan(x_ip] && !isinf(x[ip])) x[ip] = temp; ip += 
inc_x2; From 711433fcf0257cd07cf728c64e175960d0da38a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:01:58 +0100 Subject: [PATCH 100/191] Update zscal.c --- kernel/mips/zscal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index d8cbffcf9..127f4f444 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,9 +63,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - if (!isnan(x_ip] && !isinf(x[ip])) +// if (!isnan(x_ip] && !isinf(x[ip])) x[ip] = temp; - + if ( da_r != da_r ) x[ip] = da_r; ip += inc_x2; } From 903589f84b97b3975fa29a0f2a2fcef418040721 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:37:00 +0100 Subject: [PATCH 101/191] Update zscal.c --- kernel/mips/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index 127f4f444..e76fb1ff1 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } -// if (!isnan(x_ip] && !isinf(x[ip])) +// if (!isnan(x[ip]) && !isinf(x[ip])) x[ip] = temp; if ( da_r != da_r ) x[ip] = da_r; ip += inc_x2; From 0c33b57f5fb3721b2c6aa6fb5c7ddd09e17d6542 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:40:19 +0100 Subject: [PATCH 102/191] Handle NAN in input --- kernel/riscv64/zscal_vector.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index d275b75f8..24b44952d 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,7 +59,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ + if (isnan(da_r)) { + gvl = VSETVL(n); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = inc_x * 2 * gvl; + vt = VFMVVF_FLOAT(da_r, gvl); + for(i=0,j=0; i < n/(gvl*2); i++){ + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); + + j += gvl*2; + ix += inc_xv*2; + } + for(; j < n; ){ + gvl = VSETVL(n-j); + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + j += gvl; + ix += inc_x * 2 * gvl; + } + else if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From b08a208365f0bfbc8c85b85b4401d3a05f57a222 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 19:14:41 +0100 Subject: [PATCH 103/191] Update zscal_vector.c --- kernel/riscv64/zscal_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 24b44952d..33cb5d7ea 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } - else if(da_r == 0.0 && da_i == 0.0){ + } else if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From 2173356d5b63ea5e5f861bfebea387bc41d9e3b0 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Sun, 7 Jan 2024 19:45:28 +0100 Subject: [PATCH 104/191] Update zscal_vector.c --- kernel/riscv64/zscal_vector.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 33cb5d7ea..a943135df 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,28 +59,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if (isnan(da_r)) { - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(da_r, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - } else if(da_r == 0.0 && da_i == 0.0){ + if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; @@ -101,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } +#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -129,6 +110,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); } + #endif }else if(da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); From 91bbde7f64ef9d176e69416897ce5e255c351ec8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:21:05 +0100 Subject: [PATCH 105/191] revert accidental direct commit to develop --- kernel/mips/zscal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index e76fb1ff1..bca1155c1 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,9 +63,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } -// if (!isnan(x[ip]) && !isinf(x[ip])) x[ip] = temp; - if ( da_r != da_r ) x[ip] = da_r; + ip += inc_x2; } From f052bd4705210ef6c7293447b236f3a634799ba1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:21:48 +0100 Subject: [PATCH 106/191] revert accidental direct commit to develop --- kernel/riscv64/zscal_vector.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index a943135df..d275b75f8 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,7 +59,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -81,7 +80,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } -#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -110,7 +108,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); 
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); } - #endif }else if(da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); From acf17a825dadf070d595b901ae10d7f21db5aa25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:26:16 +0100 Subject: [PATCH 107/191] Handle NAN in input --- kernel/mips/zscal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..d69477f97 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,8 +63,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - x[ip] = temp; - + if ( da_r != da_r ) + x[ip] = da_r; + else + x[ip] = temp; + ip += inc_x2; } From f0808d856b69d750facf288460b65f0e9b0dde65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:27:29 +0100 Subject: [PATCH 108/191] Handle NAN in input --- kernel/riscv64/zscal_vector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index d275b75f8..77f4fc312 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -80,6 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } +#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,6 +98,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_xv; } +#endif if(j < n){ gvl = VSETVL(n-j); v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); From cf8b03ae8b59d41ea74fd7f8f52efe9fed897c23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 23:09:57 +0100 Subject: [PATCH 109/191] Use NAN rather than SNAN for portability --- kernel/x86_64/zscal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index b8d2a8754..66c8a0d2b 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -388,8 +388,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { - x[i]=SNAN; - x[i+1]=da_r; + x[i]= NAN; + x[i+1] = da_r; i += 2 ; j++; @@ -402,7 +402,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { temp0 = -da_i * x[i+1]; if (x[i] < -FLT_MAX || x[i] > FLT_MAX) - temp0 = SNAN; + temp0 = NAN; x[i+1] = da_i * x[i]; if ( x[i] == x[i]) //preserve NaN x[i] = temp0; From 5e7f714e93848c72faa2d2e658b8584aa395f603 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 08:17:40 +0100 Subject: [PATCH 110/191] Update zscal.c --- kernel/mips/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index d69477f97..b117db1dd 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip]; x[ip+1] = da_i * x[ip] ; } } From 25b0c48082a34bb2e5e9beebfdb65810e2e5affc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 09:49:18 +0100 Subject: [PATCH 111/191] Update zscal.c --- kernel/mips/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index b117db1dd..7bb261941 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,7 +47,7 
@@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; - if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip]; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From f637e1271391765b4e9fc87b958ecb4fd7541460 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 09:52:38 +0100 Subject: [PATCH 112/191] Handle INF and NAN --- kernel/riscv64/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From 7ee1ee38e22904837afd3e51dd67f91cd499185f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 14:20:07 +0100 Subject: [PATCH 113/191] Handle NaN in input --- kernel/x86/zscal_sse.S | 1 + kernel/x86/zscal_sse2.S | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index e011c98f5..89e36251e 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -87,6 +87,7 @@ xorps %xmm7, %xmm7 comiss %xmm0, %xmm7 jne .L100 # Alpha_r != ZERO + jp .L100 # Alpha_r NaN comiss %xmm1, %xmm7 jne .L100 # Alpha_i != ZERO diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index cc7ab6686..0bc61b209 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -98,6 +98,7 @@ xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 + jp .L100 comisd %xmm1, %xmm7 jne .L100 From 1c31f56e5a23f6dd7ebfd35014fc5a604e4702cd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 16:11:25 +0100 Subject: [PATCH 114/191] Handle NAN --- kernel/x86/zscal.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S index 1eb518563..adb56edb9 100644 --- a/kernel/x86/zscal.S +++ b/kernel/x86/zscal.S @@ -98,7 +98,7 @@ fcomip %st(1), %st ffreep %st(0) jne .L30 - +jp .L30 EMMS pxor %mm0, %mm0 From 058dd2a4cb4a6139796e29629191db5f2899fa1e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 8 Jan 2024 14:16:09 -0600 Subject: [PATCH 115/191] Replace two vector loads with one vector pair load and fix endianess of stores - DGEMM versions. --- kernel/power/dgemm_tcopy_16_power8.S | 3 + kernel/power/dgemm_tcopy_macros_16_power10.S | 655 +++++++++++++++++++ kernel/power/dgemm_tcopy_macros_16_power8.S | 6 + 3 files changed, 664 insertions(+) create mode 100644 kernel/power/dgemm_tcopy_macros_16_power10.S diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index fc52e0202..7a3c04e8e 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "dgemm_tcopy_macros_16_power10.S" +#endif #include "dgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..3cb416e09 --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power10.S @@ -0,0 +1,655 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs48, o0, A2 + lxvpx vs50, o32, A2 + addi A2, A2, 64 + + lxvpx vs4, o0, A3 + lxvpx vs6, o32, A3 + addi A3, A3, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + lxvpx vs12, o0, A2 + lxvpx vs2, o32, A2 + addi A2, A2, 64 + + lxvpx vs8, o0, A3 + lxvpx vs10, o32, A3 + addi A3, A3, 64 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + 
stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 +#else + stxvd2x vs49, o0, T1 + stxvd2x vs48, o16, T1 + stxvd2x vs51, o32, T1 + stxvd2x vs50, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs12, o0, T1 + stxvd2x vs13, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 +#else + stxvd2x vs13, o0, T1 + stxvd2x vs12, o16, T1 + stxvd2x vs3, o32, T1 + stxvd2x vs2, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs4, o0, T1 + stxvd2x vs5, o16, T1 + stxvd2x vs6, o32, T1 + stxvd2x vs7, o48, T1 +#else + stxvd2x vs5, o0, T1 + stxvd2x vs4, o16, T1 + stxvd2x vs7, o32, T1 + stxvd2x vs6, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 +#else + stxvd2x vs9, o0, T1 + stxvd2x vs8, o16, T1 + stxvd2x vs11, o32, T1 + stxvd2x vs10, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + addi A1, A1, 64 + + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + addi A2, A2, 64 + + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + addi A3, A3, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x4', ` +#else +.macro COPY_4x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + lxvpx vs36, o0, A2 + addi A2, A2, 32 + + + lxvpx vs38, o0, A3 + addi A3, A3, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + 
stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o0, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x4', ` +#else +.macro COPY_2x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + 
addi A1, A1, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o0, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x4', ` +#else +.macro COPY_1x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 6c5b8ed62..93b488fd0 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -275,6 +276,7 @@ define(`COPY_4x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -369,6 +371,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -512,6 +515,7 @@ define(`COPY_2x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -580,6 +584,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -675,6 +680,7 @@ define(`COPY_1x4', ` #else .endm #endif +#endif /********************************************************************************************** From 7df363e1e2daef0916faae7d0b399fc998e5433e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:08:52 +0100 Subject: [PATCH 116/191] temporarily disable the MSA C/ZSCAL kernels --- kernel/mips/KERNEL.P5600 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..f0fb5e087 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -103,8 +103,10 @@ endif ifdef HAVE_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c -CSCALKERNEL = ../mips/cscal_msa.c -ZSCALKERNEL = ../mips/zscal_msa.c +#CSCALKERNEL = ../mips/cscal_msa.c +#ZSCALKERNEL = ../mips/zscal_msa.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c else SSCALKERNEL = ../mips/scal.c DSCALKERNEL = ../mips/scal.c From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:10:56 +0100 Subject: [PATCH 117/191] Add compiler test and flag for AVX512BF16 capability --- c_check | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/c_check b/c_check index b5e4a9ad0..3e507be81 100755 --- a/c_check +++ b/c_check @@ -244,6 +244,7 @@ case "$data" in esac no_avx512=0 +no_avx512bf=0 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" @@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then } rm -rf "$tmpd" + if [ "$no_avx512" -eq 0 ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" + code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' + printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" + if [ "$compiler" = "PGI" ]; then + args=" -tp cooperlake -c -o $tmpf.o $tmpf" + else + args=" -march=cooperlake -c -o $tmpf.o $tmpf" + fi + no_avx512bf=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_avx512bf=1 + } + + rm -rf "$tmpd" + fi fi no_rv64gv=0 @@ -409,6 +429,7 @@ done [ "$makefile" = "-" ] && { [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" exit 0 @@ -437,6 +458,7 @@ done [ "$no_sve" -eq 1 ] 
&& printf "NO_SVE=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:12:46 +0100 Subject: [PATCH 118/191] Make AVX512 BFLOAT16 kernels conditional on compiler capability --- kernel/x86_64/KERNEL.COOPERLAKE | 3 ++- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index dba94aea8..22b042029 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1,5 +1,5 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX - +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c @@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 3a832e917..0ab2b4ddc 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,5 +1,6 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = SBGEMM_SMALL_K_NN = SBGEMM_SMALL_K_B0_NN = @@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif From d1ead06bd8707723e643099aa1072aefa914f660 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 09:29:13 +0100 Subject: [PATCH 119/191] define NAN and INFINITY if needed --- utest/test_zscal.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index b6310439f..8992eee90 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -2,6 +2,13 @@ #include #ifdef BUILD_COMPLEX16 +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + CTEST(zscal, i_nan) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; From b57627c27f027692a4cc9725784e1b0e2e51bc49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 12:03:08 +0100 Subject: [PATCH 120/191] Handle NAN and INF --- kernel/zarch/zscal.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d39b8447e..d019d66df 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n1) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; + if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) + temp1 = NAN; + else temp1 = -da_i * x[i + 1 + inc_x]; x[i + 1 + inc_x] = da_i * x[i + inc_x]; x[i + inc_x] = temp1; @@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 
1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += inc_x; @@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += 2; From 20413ee6ecb05b4746e107ec61614178e498509b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 13:11:13 +0100 Subject: [PATCH 121/191] Update zscal.c --- kernel/zarch/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d019d66df..4160a1a76 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -329,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else From 2e2e538b7c493b727ed9ff160e18f086f5e17384 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:02:43 +0100 Subject: [PATCH 122/191] Add openblas_set_num_threads_local() and use of blas_omp_threads_local in OMP parallel regions --- cblas.h | 1 + common_thread.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cblas.h b/cblas.h index 8a5055cf8..ade2fca3a 100644 --- a/cblas.h +++ b/cblas.h @@ -12,6 +12,7 @@ extern "C" { /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +int openblas_set_num_threads_local(int num_threads); /*Get the number of threads on runtime.*/ int openblas_get_num_threads(void); diff --git a/common_thread.h b/common_thread.h index 9e7dae74a..d37fcb189 100644 --- a/common_thread.h +++ b/common_thread.h @@ -137,19 +137,20 @@ typedef struct blas_queue { extern int blas_server_avail; extern int blas_omp_number_max; +extern int blas_omp_threads_local; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; openmp_nthreads=omp_get_max_threads(); + if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local; #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 -#endif -#ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel() +#else + if (openmp_nthreads == 1 #endif ) return 1; From 87d31af2ae8ef42c924d33633968af6262ab4344 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:06:24 +0100 Subject: [PATCH 123/191] Add openblas_set_num_threads_local() --- driver/others/blas_server_omp.c | 1 + driver/others/openblas_set_num_threads.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 213531057..4d3d1e0de 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,7 @@ int blas_server_avail = 0; int blas_omp_number_max = 0; +int blas_omp_threads_local = 1; extern int openblas_omp_adaptive_env(void); diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 0b57867b0..c938fde69 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
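A possible calling pattern for the openblas_set_num_threads_local() entry point introduced here, sketched under the assumption that a caller wants nested BLAS calls inside an OpenMP parallel region to run single-threaded and then restore the previous setting; the patch has the function return the previously active thread count, so restoring it is straightforward. The matrix sizes and the cblas_dscal call are only illustrative.

/* Illustrative usage sketch, not part of the patch.
   Build with e.g.  gcc -fopenmp example.c -lopenblas                    */
#include <stdlib.h>
#include <cblas.h>

int main(void)
{
    enum { M = 8, N = 1000 };
    double *a = malloc(sizeof(double) * M * N);
    for (int i = 0; i < M * N; i++) a[i] = 1.0;

    #pragma omp parallel for
    for (int i = 0; i < M; i++) {
        /* run the nested BLAS call with one thread inside the parallel loop */
        int prev = openblas_set_num_threads_local(1);
        cblas_dscal(N, 2.0, &a[i * N], 1);
        /* restore the previously active thread count returned above */
        openblas_set_num_threads_local(prev);
    }

    free(a);
    return 0;
}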
#ifdef SMP_SERVER extern void openblas_set_num_threads(int num_threads) ; +extern int openblas_get_num_threads(void) ; void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } +int openblas_set_num_threads_local(int num_threads){ + int ret = openblas_get_num_threads(); + openblas_set_num_threads(num_threads); + blas_omp_threads_local=num_threads; + return ret; +} + + #else //Single thread @@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads_(int* num_threads){ } + +int openblas_set_num_threads_local(int num_threads){ + return 1; +} #endif From bf66af3dc0060af9a7b001470a5300efb8f8313b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:37:36 +0100 Subject: [PATCH 124/191] remove matrix dimension 6 from DGS to avoid spurious errors from FMA --- lapack-netlib/TESTING/dgd.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in index ca24dd2a0..127b63789 100644 --- a/lapack-netlib/TESTING/dgd.in +++ b/lapack-netlib/TESTING/dgd.in @@ -1,6 +1,6 @@ DGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits From 9fab60d32f595350d599d5fe901dfaae2ce56592 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:39:05 +0100 Subject: [PATCH 125/191] Remove matrix dimension 6 from SGS to avoid spurious errors from FMA --- lapack-netlib/TESTING/sgd.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in index 164b3cc35..efab6c257 100644 --- a/lapack-netlib/TESTING/sgd.in +++ b/lapack-netlib/TESTING/sgd.in @@ -1,6 +1,6 @@ SGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits From 8a9d492af7b26f6c94965301783f5895cae9cc3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jan 2024 19:58:49 +0100 Subject: [PATCH 126/191] Add default for blas_omp_threads_local --- driver/others/blas_server.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2fcb37192..2531c57e9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void); /* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; From 152a6c43b6c61607cc91d5285f94dbef2575ff4f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jan 2024 19:59:55 +0100 Subject: [PATCH 127/191] Add blas_omp_threads_local --- driver/others/blas_server_win32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 40ff85abc..2d41af228 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -59,6 +59,8 @@ static CRITICAL_SECTION queue_lock; /* We need this global for checking if initialization is finished. 
*/ int blas_server_avail = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ static BLASULONG server_lock = 0; From 0d2e486edfb6919afea1bba62f5ac3ebc460cee0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 15 Jan 2024 11:18:59 +0100 Subject: [PATCH 128/191] Handle NAN and INF --- kernel/arm/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From 025a1b2c7b02c9984c331fde39eb27e492aeb8e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 15 Jan 2024 22:40:21 +0100 Subject: [PATCH 129/191] Only use mtune=native when not cross-compiling --- Makefile.arm64 | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 1b10446f7..ed52a9424 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -104,9 +104,15 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.4-a -mtune=native +FCOMMON_OPT += -march=armv8.4-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else @@ -138,9 +144,15 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.5-a -mtune=native +FCOMMON_OPT += -march=armv8.5-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else From 8f4e325ea82f5d5f3791b147208cbaae8dde8f64 Mon Sep 17 00:00:00 2001 From: Ian McInerney Date: Mon, 15 Jan 2024 23:42:03 +0000 Subject: [PATCH 130/191] Fix Clang sapphire rapids march flag --- Makefile.x86_64 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 702447ace..c0dbe84c8 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC) endif endif else ifeq ($(C_COMPILER), CLANG) - # cooperlake support was added in clang 12 + # sapphire rapids support was added in clang 12 ifeq ($(CLANGVERSIONGTEQ12), 1) - CCOMMON_OPT += -march=cooperlake + CCOMMON_OPT += -march=sapphirerapids ifneq ($(F_COMPILER), NAG) - FCOMMON_OPT += -march=cooperlake + FCOMMON_OPT += -march=sapphirerapids endif else # not supported in clang, fallback to avx512 CCOMMON_OPT += -march=skylake-avx512 From 9edb805e645d3530e907864e242a3f64a881b28a Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Tue, 16 Jan 2024 14:24:18 +0000 Subject: [PATCH 131/191] fix builds with t-head toolchains that use old versions of the intrinsics spec --- common_riscv64.h | 17 +++++- cpuid_riscv64.c | 4 +- kernel/riscv64/amax_vector.c | 18 +++--- kernel/riscv64/amin_vector.c | 18 +++--- kernel/riscv64/asum_vector.c | 20 ++++--- kernel/riscv64/axpby_vector.c | 16 +++--- kernel/riscv64/axpy_vector.c | 12 ++-- kernel/riscv64/copy_vector.c | 14 ++--- kernel/riscv64/dot_vector.c | 44 +++++++++------ kernel/riscv64/gemv_n_vector.c | 24 ++++---- kernel/riscv64/gemv_t_vector.c | 42 
++++++++------ kernel/riscv64/iamax_vector.c | 80 +++++++++++++++------------ kernel/riscv64/iamin_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imax_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imin_vector.c | 80 ++++++++++++++++----------- kernel/riscv64/izamax_vector.c | 94 ++++++++++++++++++-------------- kernel/riscv64/izamin_vector.c | 92 ++++++++++++++++++------------- kernel/riscv64/max_vector.c | 20 ++++--- kernel/riscv64/min_vector.c | 20 ++++--- kernel/riscv64/nrm2_vector.c | 64 ++++++++++++---------- kernel/riscv64/rot_vector.c | 36 ++++++------ kernel/riscv64/scal_vector.c | 14 ++--- kernel/riscv64/sum_vector.c | 32 +++++------ kernel/riscv64/swap_vector.c | 10 ++-- kernel/riscv64/symv_L_vector.c | 56 +++++++++++-------- kernel/riscv64/symv_U_vector.c | 60 +++++++++++--------- kernel/riscv64/zamax_vector.c | 25 +++++---- kernel/riscv64/zamin_vector.c | 25 +++++---- kernel/riscv64/zasum_vector.c | 22 +++++--- kernel/riscv64/zaxpby_vector.c | 32 +++++------ kernel/riscv64/zaxpy_vector.c | 20 +++---- kernel/riscv64/zcopy_vector.c | 12 ++-- kernel/riscv64/zdot_vector.c | 60 +++++++++++--------- kernel/riscv64/zgemv_n_vector.c | 28 +++++----- kernel/riscv64/zgemv_t_vector.c | 56 +++++++++++-------- kernel/riscv64/zhemv_LM_vector.c | 60 +++++++++++--------- kernel/riscv64/zhemv_UV_vector.c | 60 +++++++++++--------- kernel/riscv64/znrm2_vector.c | 51 +++++++++-------- kernel/riscv64/zrot_vector.c | 36 ++++++------ kernel/riscv64/zscal_vector.c | 32 +++++------ kernel/riscv64/zsum_vector.c | 16 +++--- kernel/riscv64/zswap_vector.c | 10 ++-- 42 files changed, 900 insertions(+), 696 deletions(-) diff --git a/common_riscv64.h b/common_riscv64.h index de79c8cab..f11e8b75d 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,23 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) # include +#endif + +#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) +// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this +#define RISCV_0p10_INTRINSICS +#define RISCV_RVV(x) x +#else +#define RISCV_RVV(x) __riscv_ ## x +#endif + +#if defined(C910V) || defined(RISCV64_ZVL256B) # if !defined(DOUBLE) -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) # else -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) # endif #else # define EXTRACT_FLOAT(v) (v[0]) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f21..928b5ba92 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 -#define CPU_RISCV64_ZVL256B 2 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", + "x280", "CPU_RISCV64_ZVL256B" }; diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 81a39af32..b66d4871e 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -49,15 +49,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
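For reference, a minimal sketch of how the RISCV_RVV() wrapper added to common_riscv64.h above resolves an intrinsic name under the two toolchain families. The stringification helpers exist only so the sketch can print its result on any host compiler; no real RVV intrinsic is called.

#include <stdio.h>

#if defined(__riscv_xtheadc) && defined(__riscv_v) && (__riscv_v <= 7000)
#define RISCV_RVV(x) x              /* old t-head toolchain: unprefixed names */
#else
#define RISCV_RVV(x) __riscv_ ## x  /* v1.0 intrinsics spec: __riscv_ prefix  */
#endif

#define STR_X(x) #x
#define STR(x)   STR_X(x)

int main(void)
{
    /* prints "__riscv_vsetvl_e32m4" with a v1.0-spec toolchain (or any host
       compiler), "vsetvl_e32m4" with an RVV 0.7-era t-head toolchain        */
    puts(STR(RISCV_RVV(vsetvl_e32m4)));
    return 0;
}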
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index c8ba75f4a..1c541f0fd 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -48,15 +48,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index d10bf99e6..995dbf9a1 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -49,16 +49,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
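The JOIN/JOIN2/JOIN2_X chain used throughout these kernels is the usual two-level token-pasting idiom: the extra indirection forces full macro expansion of the arguments (ELEN, LMUL, nested JOIN2 results) before ## glues the pieces together. A standalone sketch; the function here is a hypothetical stand-in, not a real intrinsic.

#include <stdio.h>

#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y)   JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32
#define LMUL m8

/* hypothetical stand-in for the intrinsic the pasted name would refer to */
static void vle32_v_f32m8(void) { puts("vle32_v_f32m8 called"); }

/* Because JOIN2 defers to JOIN2_X, its arguments are macro-expanded before
   ## is applied, so this pastes to vle32_v_f32m8.                          */
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)

int main(void)
{
    VLEV_FLOAT();
    return 0;
}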
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index b77cb58fb..386c4a5f1 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -48,15 +48,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 3447107a6..e99ca8542 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -49,13 +49,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index 710e8670a..ccbd6e482 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -47,12 +47,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) #else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T 
vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index baf3d8f69..3b467a586 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index f3b922195..cd89c63ec 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,35 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 9977e2741..894c6a643 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
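The VFREDSUM_FLOAT shim above (and the analogous ones in the symv_U changes that follow) bridges the two reduction signatures by injecting the caller's accumulator as an extra leading operand; note that the 0.10 branch names a local variable v_res directly, so it only expands correctly inside functions that declare one, which the touched kernels do. A generic C illustration of that macro pattern, with plain stand-in functions rather than RVV intrinsics:

#include <stdio.h>

/* stand-ins: the "old spec" reduction takes an extra destination operand */
static double red_new(double vec_sum, double scalar)              { return vec_sum + scalar; }
static double red_old(double dest, double vec_sum, double scalar) { (void)dest; return vec_sum + scalar; }

#ifdef OLD_SPEC
#define VFREDSUM(va, vb) red_old(v_res, va, vb)   /* relies on a local v_res */
#else
#define VFREDSUM(va, vb) red_new(va, vb)
#endif

int main(void)
{
    double v_res = 0.0;               /* required by the OLD_SPEC expansion */
    v_res = VFREDSUM(3.0, v_res);
    v_res = VFREDSUM(4.0, v_res);
    printf("%g\n", v_res);            /* 7 */
    return 0;
}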
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 4301528bd..2dee5ab29 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -53,19 +53,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 095b1c3df..df9a7a7e1 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -55,19 +55,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
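As a reminder of what the zamax/zamin kernels reduce over, a scalar reference for the amax case (illustration only; the vectorized code obtains the same |Re|+|Im| per element via the masked vfrsub, vfadd and vfredmax steps above):

#include <math.h>
#include <stdio.h>

/* scalar reference: largest |Re(x_i)| + |Im(x_i)| over interleaved storage */
static double zamax_ref(int n, const double *x)
{
    double m = 0.0;
    for (int i = 0; i < n; i++) {
        double v = fabs(x[2*i]) + fabs(x[2*i + 1]);
        if (v > m) m = v;
    }
    return m;
}

int main(void)
{
    double x[6] = { 1.0, -5.0, 3.0, 2.0, -4.0, 0.5 };
    printf("%g\n", zamax_ref(3, x));   /* 6 = |1| + |-5| */
    return 0;
}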
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 9136f0037..fca904d6a 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -53,17 +53,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 404f51fb3..d5ad974cf 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
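The zaxpby kernel below implements the complex update y := alpha*x + beta*y; as a reference for what the vfmacc/vfmsac/vfnmsac combinations compute per element, a plain scalar version with unit strides (illustration only):

#include <stdio.h>

static void zaxpby_ref(int n, double ar, double ai, const double *x,
                       double br, double bi, double *y)
{
    for (int i = 0; i < n; i++) {
        double xr = x[2*i], xi = x[2*i + 1];
        double yr = y[2*i], yi = y[2*i + 1];
        y[2*i]     = ar*xr - ai*xi + br*yr - bi*yi;   /* real part      */
        y[2*i + 1] = ar*xi + ai*xr + br*yi + bi*yr;   /* imaginary part */
    }
}

int main(void)
{
    double x[2] = { 1.0, 2.0 }, y[2] = { 3.0, 4.0 };
    zaxpby_ref(1, 0.0, 1.0, x, 1.0, 0.0, y);   /* y = i*x + y */
    printf("%g %g\n", y[0], y[1]);             /* 1 5 */
    return 0;
}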
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 20bfe74ec..d19e51118 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 9da60acb0..9e4a67b71 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 57542714a..13b8fe378 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif 
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index f4acad770..104d3865d 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index 179454094..5d85ab3a4 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
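In the zgemv_t changes that follow, the reduction call sites now pass the running accumulators (v_res_r, v_res_i) back into VFREDSUM_FLOAT, so each strip-mined chunk is folded into the total from the previous chunk instead of being reduced against a fresh zero. A plain-C sketch of that accumulation pattern (not the kernel itself; chunk size 4 stands in for the vector length gvl):

#include <stdio.h>

/* stands in for vfredusum: returns acc + sum(v[0..n-1]) */
static double reduce_chunk(const double *v, int n, double acc)
{
    for (int i = 0; i < n; i++) acc += v[i];
    return acc;
}

int main(void)
{
    double x[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    double acc = 0.0;                            /* plays the role of v_res_r */
    for (int i = 0; i < 10; i += 4) {            /* chunks of up to 4 = "gvl" */
        int n = (10 - i < 4) ? 10 - i : 4;
        acc = reduce_chunk(&x[i], n, acc);       /* fold chunk into the total */
    }
    printf("%g\n", acc);                         /* 55 */
    return 0;
}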
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -93,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); j += inc_av; ix += inc_xv; @@ -117,8 +125,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } temp_r = VFMVFS_FLOAT(v_res_r); diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 
e025120e5..117db7d84 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 0e1ea5436..7c6b63bf3 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ 
b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 437bf4246..8614f7539 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -52,37 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 
OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN) -#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN) -#define VMFEQ JOIN(__riscv_vmfeq_vv_f,ELEN, LMUL, _b, MLEN) -#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _) -#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) -#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _) -#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _) -#define VFDIV JOIN(__riscv_vfdiv, _vf_f, ELEN, LMUL, _) -#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu) -#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _) -#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _) -#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu) -#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _) -#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _) -#define VFREDUSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFABS JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VMFNE JOIN(RISCV_RVV(vmfne_vf_f),ELEN, LMUL, _b, MLEN) +#define VMFGT JOIN(RISCV_RVV(vmfgt_vv_f),ELEN, LMUL, _b, MLEN) +#define VMFEQ JOIN(RISCV_RVV(vmfeq_vv_f),ELEN, LMUL, _b, MLEN) +#define VCPOP JOIN(RISCV_RVV(vcpop), _m_b, MLEN, _, _) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAX(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFREDUSUM(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _m) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _m) +#else +#define VFREDMAX JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFREDUSUM JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _mu) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _mu) +#endif +#define VFIRST JOIN(RISCV_RVV(vfirst), _m_b, MLEN, _, _) +#define VRGATHER JOIN(RISCV_RVV(vrgather), _vx_f, ELEN, LMUL, _) +#define VFDIV JOIN(RISCV_RVV(vfdiv), _vf_f, ELEN, LMUL, _) +#define VFMUL JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _) +#define VFMACC JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _) +#define VMSOF JOIN(RISCV_RVV(vmsof), _m_b, MLEN, _, _) +#define VMANDN JOIN(RISCV_RVV(vmandn), _mm_b, MLEN, _, _) #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif -#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v) +#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v) FLOAT CNAME(BLASLONG n, 
FLOAT *x, BLASLONG inc_x) diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index c3afbc7cc..50751b343 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 5d9ab7b28..2034aafaa 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
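The renames in these hunks are mechanical: every hard-coded __riscv_ intrinsic is routed through the RISCV_RVV() wrapper so one kernel source can build against both the v0.10 and the v1.0 RVV intrinsic APIs, and the reduction macros grow an extra leading operand under RISCV_0p10_INTRINSICS because the older intrinsics take an explicit destination vector. A minimal sketch of what the wrapper amounts to; the real definition lives in common_riscv64.h, which is outside these hunks, so the exact spelling here is an assumption:

/* Sketch only: the real macro is defined in common_riscv64.h (not part of
 * these hunks), so treat the exact spelling as an assumption. */
#ifdef RISCV_0p10_INTRINSICS
#define RISCV_RVV(x) x             /* v0.10 intrinsics carry no __riscv_ prefix */
#else
#define RISCV_RVV(x) __riscv_##x   /* v1.0 intrinsics are prefixed */
#endif

/* Why the reductions need per-version macros: the v0.10 form of vfredusum
 * takes a destination vector as its first operand while the v1.0 form does
 * not, so VFREDSUM_FLOAT above either forwards or drops that operand. */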
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c index 7aab15105..ca0b02b5c 100644 --- a/kernel/riscv64/zsum_vector.c +++ b/kernel/riscv64/zsum_vector.c @@ -53,16 +53,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index d8980602d..02c98b588 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { From ec89466e14d3dff51128682b3975f8554439a2f1 Mon Sep 17 00:00:00 2001 From: Dirreke Date: Tue, 16 Jan 2024 23:45:06 +0800 Subject: [PATCH 132/191] Add CSKY support --- CONTRIBUTORS.md | 5 +- Makefile.csky | 4 + Makefile.prebuild | 4 + Makefile.system | 5 ++ TargetList.txt | 4 + c_check | 2 + c_check.pl | 7 ++ common.h | 4 + common_csky.h | 56 ++++++++++++++ ctest.c | 4 + getarch.c | 34 ++++++++- kernel/csky/KERNEL | 149 +++++++++++++++++++++++++++++++++++++ kernel/csky/Makefile | 1 + lapack/laswp/csky/Makefile | 13 ++++ param.h | 37 +++++++++ 15 files changed, 325 insertions(+), 4 deletions(-) create mode 100644 Makefile.csky create mode 100644 common_csky.h create mode 100644 kernel/csky/KERNEL create mode 100644 kernel/csky/Makefile create mode 100644 lapack/laswp/csky/Makefile diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 493747052..419f90dab 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -218,4 +218,7 @@ In chronological order: * 
[2022-08] Fix building from sources for QNX * Mark Seminatore - * [2023-11-09] Improve Windows threading performance scaling \ No newline at end of file + * [2023-11-09] Improve Windows threading performance scaling + +* Dirreke + * [2024-01-16] Add basic support for the CSKY architecture diff --git a/Makefile.csky b/Makefile.csky new file mode 100644 index 000000000..36162af2a --- /dev/null +++ b/Makefile.csky @@ -0,0 +1,4 @@ +ifeq ($(CORE), CK860FV) +CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static +endif diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..83da8e2ce 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,10 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), CK860FV) +TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.system b/Makefile.system index e602eaf05..0088eaff5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -873,6 +873,11 @@ endif endif endif +ifeq ($(ARCH), csky) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings # diff --git a/TargetList.txt b/TargetList.txt index deef75819..c11b94fa5 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -133,3 +133,7 @@ E2K EV4 EV5 EV6 + +14.CSKY +CSKY +CK860FV diff --git a/c_check b/c_check index 3e507be81..59ab9bb13 100755 --- a/c_check +++ b/c_check @@ -91,6 +91,7 @@ case "$data" in *ARCH_ZARCH*) architecture=zarch ;; *ARCH_RISCV64*) architecture=riscv64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac defined=0 @@ -236,6 +237,7 @@ case "$data" in *ARCH_ARM*) architecture=arm ;; *ARCH_ZARCH*) architecture=zarch ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac binformat='bin32' diff --git a/c_check.pl b/c_check.pl index d9c36793c..6b89f06eb 100644 --- a/c_check.pl +++ b/c_check.pl @@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $defined = 0; @@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") { $binary = 64; } +if ($architecture eq "csky") { + $defined = 1; + $binary = 32; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index 462c1d428..1f0b9e533 100644 --- a/common.h +++ b/common.h @@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_e2k.h" #endif +#ifdef ARCH_CSKY +#include "common_csky.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_csky.h b/common_csky.h new file mode 100644 index 000000000..3095dc781 --- /dev/null +++ b/common_csky.h @@ -0,0 +1,56 @@ 
+/***************************************************************************** +Copyright (c) 2011-2015, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#ifndef COMMON_CSKY +#define COMMON_CSKY + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif \ No newline at end of file diff --git a/ctest.c b/ctest.c index 2ccae8dcc..cbc15326f 100644 --- a/ctest.c +++ b/ctest.c @@ -173,6 +173,10 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__csky__) +ARCH_CSKY +#endif + #if defined(__EMSCRIPTEN__) ARCH_RISCV64 OS_WINDOWS diff --git a/getarch.c b/getarch.c index 87384c084..2d26da079 100644 --- a/getarch.c +++ b/getarch.c @@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_EV4 */ /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ +/* #define FORCE_CSKY */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -1692,6 +1694,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
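The ctest.c and c_check hunks in this patch work as a pair: ctest.c is only run through the target compiler's preprocessor, so the bare ARCH_CSKY token survives exactly when the toolchain predefines __csky__, and c_check then pattern-matches that token to pick architecture=csky with a 32-bit binary. The snippet below restates that probe as a standalone program purely for illustration; the printf wrapper and the ARCH_NAME macro are additions of this sketch, not part of the patch:

/* Illustration only: ctest.c itself emits bare ARCH_* tokens and is merely
 * preprocessed, after which c_check greps the output; the string/printf
 * wrapping here is an assumption added so the sketch compiles standalone. */
#include <stdio.h>

#if defined(__csky__)
#define ARCH_NAME "ARCH_CSKY"   /* c_check maps this to architecture=csky, binary=32 */
#else
#define ARCH_NAME "ARCH_UNKNOWN"
#endif

int main(void) { puts(ARCH_NAME); return 0; }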
#define CORENAME "generic" #endif +#ifdef FORCE_CSKY +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CSKY" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCSKY" \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "csky" +#define CORENAME "CSKY" +#endif + +#ifdef FORCE_CK860FV +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CK860V" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCK860FV " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ck860fv" +#define CORENAME "CK860FV" +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1766,7 +1795,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif - #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." #endif @@ -1831,7 +1859,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1979,7 +2007,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/csky/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c 
+ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/csky/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/csky/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 469c38ce3..e048dabe7 100644 --- a/param.h +++ b/param.h @@ -3807,7 +3807,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SYMV_P 16 #endif +#if defined(CSKY) || defined(CK860FV) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef GENERIC From ec46ca7a4344dac05f16ce29e3be86368f595d0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Jan 2024 07:33:10 +0100 Subject: [PATCH 133/191] Support Arm Compiler for Linux as classic flang (#4436) * Support ArmCompilerforLinux as classic flang --- f_check | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index dac34edee..81f598ffa 100755 --- a/f_check +++ b/f_check @@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then pathf90 pathf95 pgf95 pgf90 pgf77 pgfortran nvfortran flang egfortran - ifort nagfor ifx ftn crayftn" + ifort nagfor ifx ftn crayftn armflang" for list in $lists; do for p in $path; do @@ -85,7 +85,11 @@ else *Hewlett*) vendor=CRAY openmp='-fopenmp' - ;; + ;; + *Arm\ F90*) + vendor=FLANG + openmp='-fopenmp' + ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -108,7 +112,7 @@ else if [ "$major" -ge 17 ]; then vendor=FLANGNEW fi - ;; + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Jan 2024 20:59:24 +0100 Subject: [PATCH 134/191] Increase multithreading threshold by a factor of 50 --- interface/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 1f0763579..2c121f130 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) + if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); From a4992e09bc75cf95ea829c8ce3a8e2ea09d3d1fc Mon Sep 17 00:00:00 2001 From: Pierrick Bouvier Date: Thu, 18 Jan 2024 18:20:37 +0400 Subject: [PATCH 135/191] Fix utest compilation Introduced recently when adding new test cases for ZSCAL - include cblas is needed for cblas_zscal - ASSERT macro does not exist - missing closing ) --- utest/utest_main2.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/utest/utest_main2.c b/utest/utest_main2.c index 8cb663190..46a3b067d 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. #define CTEST_SEGFAULT #define CTEST_ADD_TESTS_MANUALLY +#include "cblas.h" #include "openblas_utest.h" CTEST(amax, samax){ @@ -623,10 +624,10 @@ CTEST(zscal, i_nan) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; cblas_zscal(9, i, &nan, 1); - ASSERT(isnan(nan[0]); - ASSERT(isnan(nan[1]); - ASSERT(isnan(nan[16]); - ASSERT(isnan(nan[17]); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); } CTEST(zscal, nan_i) @@ -634,10 +635,10 @@ CTEST(zscal, nan_i) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; cblas_zscal(9, &nan, &i, 1); - ASSERT(isnan(i[0]); - ASSERT(isnan(i[1]); - ASSERT(isnan(i[16]); - ASSERT(isnan(i[17]); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); } CTEST(zscal, i_inf) @@ -645,10 +646,10 @@ CTEST(zscal, i_inf) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; cblas_zscal(9, i, &inf, 1); - ASSERT(isnan(inf[0]); - ASSERT(isinf(inf[1]); - ASSERT(isnan(inf[16]); - ASSERT(isinf(inf[17]); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); } CTEST(zscal, inf_i) @@ -656,10 +657,10 @@ CTEST(zscal, inf_i) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; cblas_zscal(9, &inf, &i, 1); - ASSERT(isnan(i[0]); - ASSERT(isinf(i[1]); - ASSERT(isnan(i[16]); - ASSERT(isinf(i[17]); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); } int main(int argc, const char ** argv){ From b193ea3d7b6ae9950d6c7c73937661318a719c26 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 18 Jan 2024 22:11:12 +0300 Subject: [PATCH 136/191] Fix BLAS and LAPACK tests for RVV 1.0 target, update to 0.12.0 intrincics * Update intrincics API to 0.12.0 version (Stride Segment Loads/Stores) * Fixed nrm2, axpby, ncopy, zgemv and scal kernels * Added zero size checks --- kernel/riscv64/axpby_rvv.c | 8 +- kernel/riscv64/copy_rvv.c | 2 +- kernel/riscv64/gemm_ncopy_8_rvv.c | 51 ++++- kernel/riscv64/gemm_tcopy_8_rvv.c | 71 +++--- kernel/riscv64/izamax_rvv.c | 23 +- kernel/riscv64/izamin_rvv.c | 23 +- kernel/riscv64/nrm2_rvv.c | 237 +++++++++++++------ kernel/riscv64/scal_rvv.c | 49 ++-- kernel/riscv64/symv_U_rvv.c | 2 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 37 +-- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 33 ++- kernel/riscv64/zamax_rvv.c | 23 +- kernel/riscv64/zamin_rvv.c | 23 +- kernel/riscv64/zaxpby_rvv.c | 63 ++++-- kernel/riscv64/zaxpy_rvv.c | 76 +++++-- kernel/riscv64/zcopy_rvv.c | 40 ++-- kernel/riscv64/zdot_rvv.c | 49 +++- kernel/riscv64/zgemm_beta_rvv.c | 27 ++- kernel/riscv64/zgemm_ncopy_4_rvv.c | 72 ++++-- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 18 +- kernel/riscv64/zgemm_tcopy_4_rvv.c | 60 +++-- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 18 +- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 144 +++++++++--- kernel/riscv64/zgemv_n_rvv.c | 50 ++-- kernel/riscv64/zgemv_t_rvv.c | 44 ++-- 
kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 33 ++- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 33 ++- kernel/riscv64/znrm2_rvv.c | 301 +++++++++++++++++++------ kernel/riscv64/zrot_rvv.c | 95 ++++++-- kernel/riscv64/zscal_rvv.c | 66 ++++-- kernel/riscv64/zsum_rvv.c | 23 +- kernel/riscv64/zswap_rvv.c | 62 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 33 ++- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 33 ++- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 31 ++- kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 110 ++++++--- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 26 +-- param.h | 2 +- 46 files changed, 1628 insertions(+), 709 deletions(-) diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index a1dbdb0e4..d7fb86eab 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * { FLOAT_V_T vx, vy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( beta == 0.0 ) { if ( alpha == 0.0 ) { @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_y = inc_y * sizeof(FLOAT); size_t vl = VSETVL(n); vy = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) { + for ( ; n > 0; n -= vl, y += vl*inc_y) { vl = VSETVL(n); VSSEV_FLOAT(y, stride_y, vy, vl); } @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } else { if ((1 == inc_x) && (1 == inc_y)) { - for (size_t vl; n > 0; n -= vl, y += vl) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); vy = VLEV_FLOAT(y, vl); vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); VSEV_FLOAT (y, vy, vl); } } else if (1 == inc_x) { diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 041fd2dae..9d4b84095 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); FLOAT_V_T v0; diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 3030d67fb..c652ab0c0 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
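The gemm_ncopy_8_rvv.c changes above show the pattern repeated throughout this patch: in the 0.12.0 intrinsics, segment loads and stores operate on tuple types (vfloat32m1x2_t and friends) rather than on separately passed member registers, so the kernels assemble a tuple with __riscv_vset and issue a single segment store. A self-contained sketch of the store side, mirroring the two-row packing branch; the function name and sizes are illustrative only:

/* Sketch of the 0.12.0-style tuple segment store used by the ncopy kernels:
 * interleave two rows as r0[0], r1[0], r0[1], r1[1], ...  (illustrative only). */
#include <riscv_vector.h>

static void pack_two_rows(const float *r0, const float *r1, float *dst, size_t n)
{
    for (size_t vl; n > 0; n -= vl, r0 += vl, r1 += vl, dst += 2 * vl) {
        vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t v0 = __riscv_vle32_v_f32m1(r0, vl);
        vfloat32m1_t v1 = __riscv_vle32_v_f32m1(r1, vl);
        vfloat32m1x2_t t;                            /* tuple of two members, both  */
        t = __riscv_vset_v_f32m1_f32m1x2(t, 0, v0);  /* written before the store,   */
        t = __riscv_vset_v_f32m1_f32m1x2(t, 1, v1);  /* same pattern as the kernels */
        __riscv_vsseg2e32_v_f32m1x2(dst, t, vl);     /* one interleaving store      */
    }
}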
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) FLOAT *b_offset; FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v7 = VLEV_FLOAT(a_offset7, vl); v8 = VLEV_FLOAT(a_offset8, vl); - VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); a_offset1 += vl; a_offset2 += vl; @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v3 = VLEV_FLOAT(a_offset3, vl); v4 = VLEV_FLOAT(a_offset4, vl); - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); a_offset1 += vl; a_offset2 += vl; @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v1 = VLEV_FLOAT(a_offset1, vl); v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += vl; a_offset2 += vl; diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 080a87312..4742ae6a7 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 8; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 8; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 32; @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 8; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 16; @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, 
&v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e93f0056c..32f66a7a7 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; UINT_V_T v_max_index; MASK_T mask; @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index b5bc27404..d34b220fa 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -60,8 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
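On the load side the same 0.12.0 change shows up in izamax_rvv.c and izamin_rvv.c above: one vlseg2 call now returns a two-member tuple, and the real and imaginary lanes are extracted with __riscv_vget. A minimal sketch of that deinterleave step; the helper name and the |re| + |im| magnitude it forms simply mirror what those kernels compute:

/* Sketch of the tuple segment load + vget deinterleave used by izamax/izamin:
 * load one chunk of interleaved complex floats and form |re| + |im|. */
#include <riscv_vector.h>

static vfloat32m4_t cabs1_chunk(const float *x, size_t vl)
{
    vfloat32m4x2_t xx = __riscv_vlseg2e32_v_f32m4x2(x, vl);   /* tuple load       */
    vfloat32m4_t re  = __riscv_vget_v_f32m4x2_f32m4(xx, 0);   /* real lanes       */
    vfloat32m4_t im  = __riscv_vget_v_f32m4x2_f32m4(xx, 1);   /* imaginary lanes  */
    re = __riscv_vfabs_v_f32m4(re, vl);
    im = __riscv_vfabs_v_f32m4(im, vl);
    return __riscv_vfadd_vv_f32m4(re, im, vl);
}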
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -90,6 +94,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(min_index); FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; UINT_V_T v_min_index; MASK_T mask; @@ -104,7 +109,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -127,7 +135,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 994fadb70..3eb423849 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -26,78 +26,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define ABS fabsf -#else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 + +#if defined(DOUBLE) +#define VSETVL __riscv_vsetvl_e64m4 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define MASK_T vbool16_t +#define VFABS __riscv_vfabs_v_f64m4 +#define VMFNE __riscv_vmfne_vf_f64m4_b16 +#define VMFGT __riscv_vmfgt_vv_f64m4_b16 +#define VMFEQ __riscv_vmfeq_vf_f64m4_b16 +#define VCPOP __riscv_vcpop_m_b16 +#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 +#define VFIRST __riscv_vfirst_m_b16 +#define VRGATHER 
__riscv_vrgather_vx_f64m4 +#define VFDIV __riscv_vfdiv_vv_f64m4 +#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu +#define VFMUL __riscv_vfmul_vv_f64m4 +#define VFMUL_M __riscv_vfmul_vv_f64m4_mu +#define VFMACC __riscv_vfmacc_vv_f64m4 +#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu +#define VMSBF __riscv_vmsbf_m_b16 +#define VMSOF __riscv_vmsof_m_b16 +#define VMAND __riscv_vmand_mm_b16 +#define VMANDN __riscv_vmand_mm_b16 +#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 +#define VMERGE __riscv_vmerge_vvm_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) #define ABS fabs +#else +#define VSETVL __riscv_vsetvl_e32m4 +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define MASK_T vbool8_t +#define VFABS __riscv_vfabs_v_f32m4 +#define VMFNE __riscv_vmfne_vf_f32m4_b8 +#define VMFGT __riscv_vmfgt_vv_f32m4_b8 +#define VMFEQ __riscv_vmfeq_vf_f32m4_b8 +#define VCPOP __riscv_vcpop_m_b8 +#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 +#define VFIRST __riscv_vfirst_m_b8 +#define VRGATHER __riscv_vrgather_vx_f32m4 +#define VFDIV __riscv_vfdiv_vv_f32m4 +#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu +#define VFMUL __riscv_vfmul_vv_f32m4 +#define VFMUL_M __riscv_vfmul_vv_f32m4_mu +#define VFMACC __riscv_vfmacc_vv_f32m4 +#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu +#define VMSBF __riscv_vmsbf_m_b8 +#define VMSOF __riscv_vmsof_m_b8 +#define VMAND __riscv_vmand_mm_b8 +#define VMANDN __riscv_vmand_mm_b8 +#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 +#define VMERGE __riscv_vmerge_vvm_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) +#define ABS fabsf #endif - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - - if( n <= 0 ) return(0.0); - if(n == 1) return (ABS(x[0])); - - FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_res; - FLOAT ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - - vr = VFMVVF_FLOAT(0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + BLASLONG i=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + if(n == 1) return (ABS(x[0])); + + unsigned int gvl = 0; + + MASK_T nonzero_mask; + MASK_T scale_mask; + + gvl = VSETVL(n); + FLOAT_V_T v0; + FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); + FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); + + FLOAT scale = 0; + FLOAT ssq = 0; + unsigned int stride_x = inc_x * sizeof(FLOAT); + int idx = 0; + + if( n >= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl * inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //finish any tail using scalar ops + i*=gvl*inc_x; + n*=inc_x; + while(i < n){ + if ( x[i] != 0.0 ){ + FLOAT absxi = ABS( x[i] ); + if ( scale < absxi ){ + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else{ + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + + i += inc_x; } - } - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); + return(scale * sqrt(ssq)); +} - ssq = VFMVFS_FLOAT_M1(v_res); - return sqrt(ssq); -} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index 
2e2cfd31e..2c273fb63 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() #define FLOAT_V_T vfloat32m8_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 @@ -38,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() #define FLOAT_V_T vfloat64m8_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 @@ -54,26 +56,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0; if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSEV_FLOAT(x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } } - - } else { + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); - for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSSEV_FLOAT(x, stride_x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } } - } return 0; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3cfd3ee4c..bcd2f6981 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; - + BLASLONG m1 = m - offset; if(inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 886af0c3b..869561fb3 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -140,6 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; a += (m - 1) * m * 2; @@ -153,7 +158,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -165,12 +172,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = 0; k < i; k ++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -182,7 +193,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, 
vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index ddeef966c..da443cfba 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -137,6 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; @@ -149,7 +154,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -161,12 +168,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = i + 1; k < m; k++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = 
VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -178,7 +189,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4c83bbaa3..32e481036 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -31,13 +31,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +135,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -147,7 +150,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -159,13 +164,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = i + 1; k < n; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = 
VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -177,7 +186,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index b368eefb9..81cc41818 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -42,10 +45,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +139,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -149,7 +156,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B pcj = c; for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -161,13 +170,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = 0; k < i; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), 
vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -179,7 +192,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index bbb1e876b..180cf059a 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmax; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -95,7 +103,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index c5453121b..56a467502 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmin; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -94,7 +102,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index e0da55311..66e38c1e4 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -35,6 +35,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 @@ -42,13 +45,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 @@ -56,10 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) @@ -74,6 +80,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG stride_x = inc_x2 * sizeof(FLOAT); BLASLONG stride_y = inc_y2 * sizeof(FLOAT); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if ( beta_r == 0.0 && beta_i == 0.0) { @@ -81,10 +88,12 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL { size_t vl = VSETVL(n); FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) + vxx2 = VSET_VX2(vxx2, 0, temp); + vxx2 = VSET_VX2(vxx2, 1, temp); + for ( ; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); } } else @@ -92,7 +101,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); @@ -100,20 +112,26 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } } else { FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; if ( alpha_r == 0.0 && alpha_i == 0.0 ) { for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vy1, beta_i, vl); v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); @@ -121,7 +139,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG 
inc_x, FL v1 = VFMULVF_FLOAT(vy1, beta_r, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } else @@ -129,8 +149,14 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); @@ -142,7 +168,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } } diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 3f75898e0..0db32df10 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -30,19 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -53,14 +59,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if(da_r == 0.0 && da_i == 0.0) return(0); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); @@ -72,7 +85,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = 
VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1) { @@ -82,8 +97,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -96,7 +116,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1) { @@ -106,8 +128,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -120,7 +147,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -131,8 +160,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -145,7 +179,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index bd94810ce..13879f03b 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -34,11 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 #define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T_M8 vfloat64m8_t @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 #define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); if(inc_x == 1 && inc_y == 1) { @@ -70,34 +70,34 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) }else if (1 == inc_x) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSEG_FLOAT_M4(&vr, &vi, x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } else if (1 == inc_y) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSEG_FLOAT_M4(y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, vx2, vl); } } else { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index fa0e89353..13bc2ee39 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -71,6 +75,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; size_t vlmax_m1 = VSETVL_MAX_M1; v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); @@ -83,8 +88,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -104,8 +114,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -124,8 +139,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -145,8 +165,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); diff 
--git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index b94b5f4bf..ee334801b 100644 --- a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -41,8 +41,11 @@ #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 @@ -50,8 +53,11 @@ #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 @@ -68,6 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT *c_offset; size_t vl; FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; ldc *= 2; c_offset = c; @@ -77,6 +84,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vl = VSETVL(m); vr = VFMVVF_FLOAT(0.0, vl); vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); for( ; n > 0; n--, c += ldc) { c_offset = c; @@ -84,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VSSEG_FLOAT(c_offset, vr, vi, vl); + VSSEG_FLOAT(c_offset, vx2, vl); } } @@ -96,7 +105,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VLSEG_FLOAT(&vr, &vi, c_offset, vl); + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); v1 = VFMULVF_FLOAT(vr, beta_r, vl); v2 = VFMULVF_FLOAT(vi, beta_i, vl); @@ -107,7 +118,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vr = VFSUBVV_FLOAT(v1, v2, vl); vi = VFADDVV_FLOAT(v3, v4, vl); - VSSEG_FLOAT(c_offset, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); } } diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index d50a4b8d5..dce98752e 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -29,18 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VGET_VX2 __riscv_vget_v_f32m1x2_f32m1 +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VGET_VX2 __riscv_vget_v_f64m1x2_f64m1 +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c @@ -53,7 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ FLOAT *boffset; - FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2; + FLOAT_VX4_T vxx4; + FLOAT_VX8_T vxx8; size_t vl; aoffset = a; @@ -69,12 +83,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); - VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); - VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); - - VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -91,10 +114,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); - VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + VSSEG4_FLOAT(boffset, vxx4, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -108,9 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for 
(i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); - VSSEG2_FLOAT(boffset, v11, v12, vl); + VSSEG2_FLOAT(boffset, v1x2, vl); aoffset1 += vl * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index 1d3b8d3b7..275daa5f2 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -30,14 +30,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ @@ -48,7 +48,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ FLOAT *a_offset1; FLOAT *b_offset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a_offset += vl * lda * 2; for(i = m; i > 0; i--) { - VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(b_offset, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += 2; b_offset += vl * 2; diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 8c35b5616..cfafbf0dc 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -30,25 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ @@ -60,7 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ IFLOAT *boffset, *boffset1, *boffset2, *boffset3; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -81,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -91,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -101,8 +111,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -119,8 +129,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, 
v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -129,8 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -139,8 +149,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); //aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7a085269c..96e986502 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -29,14 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -47,7 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *aoffset1; IFLOAT *boffset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) aoffset += vl * 2; for(i = m; i > 0; i--) { - VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); - VSSEG2_FLOAT(boffset, v0, v1, vl); + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); aoffset1 += lda * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 41399cf79..77e012ff5 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -30,20 +30,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -80,6 +86,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b BLASLONG i,j,k; FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -109,10 +116,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -137,7 +148,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -162,7 +175,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -211,7 +226,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -237,35 +254,57 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, 
va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); - VLSEG2_FLOAT(&va0, &va1, C2, vl); - VLSEG2_FLOAT(&va2, &va3, C3, vl); + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -294,9 +333,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -311,7 +355,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -326,7 +372,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -356,7 +404,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -372,20 +422,31 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - 
VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -409,9 +470,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -420,7 +486,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -430,7 +498,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -448,7 +518,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -458,12 +530,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); C0 += vl * 2; } diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 4a40c30a7..f14ef5ba8 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
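/*
 * Illustrative sketch, not part of the patch: the hunks above and below all
 * follow the same migration, replacing the old pointer-output segment
 * intrinsics (VLSEG2_FLOAT(&v0, &v1, p, vl) / VSSEG2_FLOAT(p, v0, v1, vl))
 * with the tuple-typed forms (vfloat32m2x2_t and friends) combined with
 * __riscv_vget / __riscv_vset. The helper below is invented purely for
 * illustration and assumes a toolchain shipping the current riscv_vector.h
 * tuple API; it scales an interleaved complex array by a real factor using
 * exactly that load / vget / compute / vset / store pattern.
 */
#include <riscv_vector.h>
#include <stddef.h>

static void cscale_tuple_sketch(float *x, size_t n, float alpha)
{
    for (size_t vl; n > 0; n -= vl, x += vl * 2) {
        vl = __riscv_vsetvl_e32m2(n);

        /* one segment load returns real and imaginary parts as a 2-tuple */
        vfloat32m2x2_t vx2 = __riscv_vlseg2e32_v_f32m2x2(x, vl);
        vfloat32m2_t re = __riscv_vget_v_f32m2x2_f32m2(vx2, 0);
        vfloat32m2_t im = __riscv_vget_v_f32m2x2_f32m2(vx2, 1);

        re = __riscv_vfmul_vf_f32m2(re, alpha, vl);
        im = __riscv_vfmul_vf_f32m2(im, alpha, vl);

        /* pack both fields back into the tuple before the segment store */
        vx2 = __riscv_vset_v_f32m2_f32m2x2(vx2, 0, re);
        vx2 = __riscv_vset_v_f32m2_f32m2x2(vx2, 1, im);
        __riscv_vsseg2e32_v_f32m2x2(x, vx2, vl);
    }
}
/* Current RVV intrinsics (v0.12 and later) expose segment loads/stores through
 * these tuple types, which is the pattern every kernel in this series adopts. */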
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -62,6 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a_ptr; FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -73,7 +80,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -84,7 +94,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -113,7 +126,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSEG_FLOAT(y, vy0, vy1, vl); + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } @@ -123,7 +139,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -134,7 +152,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) 
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -163,7 +183,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } return(0); diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 2f0380530..1c89a9f72 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -32,9 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -46,9 +48,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -66,6 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT_V_T va0, va1, vx0, vx1, vr, vi; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; //BLASLONG stride_a = sizeof(FLOAT) * 2; @@ -73,6 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -86,8 +92,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -104,9 +115,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -130,8 +141,13 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -148,9 +164,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 79b20a646..97013895a 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -81,6 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -96,8 +103,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posY * 2 + posX * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -111,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index a86815275..59029e9e5 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -79,6 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -94,8 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -108,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index d2b27aa8d..32f67758a 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,95 +28,248 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif -// TODO: Should single precision use the widening MAC, or perhaps all should be double? 
- FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i=0, j=0; - if ( n <= 0 ) return(0.0); - - FLOAT_V_T vr, v0, v1; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); + if (n <= 0 || inc_x <= 0) return(0.0); - vr = VFMVVF_FLOAT(0, vlmax); + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; if (inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - VLSEG_FLOAT(&v0, &v1, x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + //ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + idx += inc_v; + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //tail + if (j < n) { + gvl = VSETVL(n-j); + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if(index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + } else { // found greater element + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) {//no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater 
element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); } - } - - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); - - ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - - return(scale * sqrt(ssq)); + return(scale * sqrt(ssq)); } diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index ee81bfe91..1d5390684 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -30,28 +30,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -63,6 +69,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if (n <= 0) return(0); FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2, vtx2; if (inc_x == 0 && inc_y == 0) { BLASLONG i=0; @@ -93,8 +100,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = 
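/*
 * Illustrative sketch, not part of the patch: the znrm2_rvv.c hunk above
 * switches to a scaled sum-of-squares accumulation, rescaling the running sum
 * whenever an element larger than the current scale is found. The scalar
 * routine below (name invented for illustration) shows the same LAPACK-style
 * (scale, ssq) recurrence that the vector code applies chunk by chunk with
 * vmfgt/vfirst masks and vfdiv/vfmacc.
 */
#include <math.h>

static double nrm2_scale_ssq_sketch(long n, const double *x)
{
    double scale = 0.0, ssq = 1.0;

    for (long i = 0; i < n; i++) {
        double a = fabs(x[i]);
        if (a == 0.0) continue;
        if (scale < a) {
            /* rescale the sum accumulated so far to the new, larger scale */
            ssq = 1.0 + ssq * (scale / a) * (scale / a);
            scale = a;
        } else {
            ssq += (a / scale) * (a / scale);
        }
    }
    /* ||x|| = scale * sqrt(ssq); avoids overflow/underflow of plain x[i]^2 */
    return scale * sqrt(ssq);
}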
VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -105,8 +117,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1){ @@ -115,8 +132,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -127,8 +149,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1){ @@ -137,8 +164,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -149,8 +181,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -160,8 +197,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -172,8 +214,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zscal_rvv.c 
b/kernel/riscv64/zscal_rvv.c index 779fab68c..2586c6036 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 @@ -43,10 +46,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -61,6 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_V_T vt, vr, vi; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; if(da_r == 0.0 && da_i == 0.0) { @@ -71,16 +78,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - VSSEG_FLOAT(x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } @@ -89,12 +98,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vi, -da_i, vl); vi = VFMULVF_FLOAT(vr, da_i, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else if(da_i == 0.0) { @@ -102,12 +116,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vr = VFMULVF_FLOAT(vr, da_r, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); - VSSSEG_FLOAT(x, 
stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else { @@ -117,14 +135,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vr, &vi, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSEG_FLOAT(x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { @@ -132,14 +154,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index b41f70eb5..489188bd5 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -32,8 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -44,8 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -59,6 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vlmax = VSETVL_MAX; FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); @@ -67,7 +72,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); @@ -80,7 +88,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 17b7b9f43..c2adf5e05 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n <= 0) return(0); - FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if (inc_x == 0 && inc_y == 0) { if (n & 1) { @@ -75,8 +75,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl); - VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); } y[0] = temp[0]; y[1] = 
temp[1]; @@ -92,8 +92,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl); - VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); } x[0] = temp[0]; x[1] = temp[1]; @@ -103,11 +103,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_x == 1){ @@ -116,11 +116,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_y == 1){ @@ -129,11 +129,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } else { @@ -143,11 +143,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index 0f9e04869..f4d806190 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -70,6 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -85,15 +92,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index fdc693700..069551bb0 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -71,6 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -86,15 +93,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + 0 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index 7276618c5..ae664561b 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t #define VID_V_UINT __riscv_vid_v_i32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -69,6 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -98,8 +103,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X > posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -119,7 +124,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -128,7 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index 72e8f2ce2..ab8d34337 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -32,11 +32,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -46,11 +48,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -65,6 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; #ifdef UNIT @@ -101,8 +106,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON else if (X < posY) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; @@ -115,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (unsigned int j = 0; j < vl; j++) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +132,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index e6d36c86d..ba6e63b96 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -67,6 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -96,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X < posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -118,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -127,7 +135,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 7085cfc37..a624fff54 100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -34,11 +34,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -48,11 +50,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -66,6 +70,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, j, js, X; FLOAT *ao; + + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; #ifdef UNIT VBOOL_T vbool_eq; @@ -103,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else if (X > posY) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; X++; @@ -115,7 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +133,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 92b4b855b..db5f06af8 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -30,10 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 @@ -41,10 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -85,6 +91,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b off = 0; #endif + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -130,10 +137,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -158,7 +169,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -183,7 +196,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -233,7 +248,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -262,25 +279,37 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); va0 = VFMULVF_FLOAT(vres4, alphar, vl); va1 = VFMULVF_FLOAT(vres5, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMULVF_FLOAT(vres6, alphar, vl); va3 = VFMULVF_FLOAT(vres7, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); #if ( 
defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -342,10 +371,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -360,7 +393,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -375,7 +410,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -405,7 +442,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -425,13 +464,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -487,10 +532,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -500,7 +549,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -510,7 +561,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -530,7 +583,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 
0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -545,7 +600,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index 383cb883f..36cec711d 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG stride_lda = sizeof(FLOAT)*lda*2; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; size_t vl; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); ao += 2; @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index f57e9f1de..3a7bdb522 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); b += vl * 2; ao += lda * 2; @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index be3613429..2a158d4de 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -31,20 +31,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -83,9 +83,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT for (unsigned int j = 0; j < vl; j++) { compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); ao += 2; b += vl * 2; } @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index b1f5ef8f0..4b3319588 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -81,9 +81,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/param.h b/param.h index c5c70b78e..d93221d28 100644 --- a/param.h +++ b/param.h @@ -3057,7 +3057,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 16 +#define CGEMM_DEFAULT_UNROLL_MN 32 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4 From ff41cf5c49bda6bf84950812e737b2e7bcddf139 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 17 Mar 2023 14:28:26 +0300 Subject: [PATCH 137/191] Fix BLAS, BLAS-like functions and Generic RISC-V kernels * Fixed gemmt, imatcopy, zimatcopy_cnc functions * Fixed cblas_cscal testing in ctest * Removed rotmg unreachable code * Added zero size checks --- cblas.h | 8 ++ common_interface.h | 9 ++ ctest/c_cblat1.f | 10 +- ctest/c_cblat1c.c | 6 +- interface/gemmt.c | 233 ++++++++++++++++++++++----------- interface/imatcopy.c | 8 +- interface/rotmg.c | 28 +--- interface/zimatcopy.c | 6 +- kernel/generic/zimatcopy_cnc.c | 1 - kernel/riscv64/axpby.c | 2 +- kernel/riscv64/axpy.c | 2 +- kernel/riscv64/copy.c | 2 +- kernel/riscv64/dot.c | 2 +- kernel/riscv64/swap.c | 2 +- kernel/riscv64/zaxpy.c | 2 +- kernel/riscv64/zcopy.c | 2 +- kernel/riscv64/zswap.c | 2 +- 17 files changed, 201 insertions(+), 124 deletions(-) diff --git a/cblas.h b/cblas.h index c2bdd27fa..f7d36788d 100644 --- a/cblas.h +++ b/cblas.h @@ -289,6 +289,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order,
OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/common_interface.h b/common_interface.h index 318827920..61a82c306 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); + int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..cad7c7fa7 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 8c0dd140c..af29301af 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -685,7 +685,7 @@ real *sfac; static integer i__; extern /* Subroutine */ int ctest_(); static complex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int itest1_(), stest1_(); + extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); static complex cx[8]; extern real scnrm2test_(); static integer np1; @@ -727,7 +727,7 @@ real *sfac; stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); } else if (combla_1.icase == 8) { /* .. CSCAL .. 
*/ - cscal_(&combla_1.n, &ca, cx, &combla_1.incx); + cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); } else if (combla_1.icase == 9) { @@ -761,7 +761,7 @@ real *sfac; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; /* L80: */ } - cscal_(&c__5, &ca, cx, &combla_1.incx); + cscaltest_(&c__5, &ca, cx, &combla_1.incx); ctest_(&c__5, cx, mwpct, mwpcs, sfac); } else if (combla_1.icase == 9) { /* CSSCALTEST */ diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..a4530721c 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,18 +65,22 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -159,32 +159,47 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, if (Uplo == 'L') uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -248,9 +269,6 @@ void 
CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) info = 2; if (transa < 0) + info = 3; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; @@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if (m == 0) return; IDEBUG_START; - FUNCTION_PROFILE_START(); +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif - const blasint incb = (transb == 0) ? 1 : ldb; + const blasint incb = ((transb & 1) == 0) ? 
1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb & 1) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i; - bb = b + i; } + if (transb & 1) + bb = b + i; cc = c + i * ldc + i; #endif @@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -458,8 +522,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -537,8 +611,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -558,32 +630,41 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; return; -} +} \ No newline at end of file diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f4..109280fe6 100644 --- 
a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -149,10 +149,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95..b8f627221 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4e..7d73ba572 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -171,10 +171,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c index 8e772bd8a..6426cffc0 100644 --- a/kernel/generic/zimatcopy_cnc.c +++ b/kernel/generic/zimatcopy_cnc.c @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); aptr = a; lda *= 2; diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c index 278747f75..04f9518d3 100644 --- a/kernel/riscv64/axpby.c +++ b/kernel/riscv64/axpby.c @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); ix = 0; iy = 0; diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9..19d12ad3f 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c index 7b4f04f30..e79ca59af 100644 --- a/kernel/riscv64/copy.c +++ b/kernel/riscv64/copy.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0; BLASLONG ix=0,iy=0; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index 46a84ad18..bf55998ca 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); while(i < n) { diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c index eac621fb2..33bbeeb6a 100644 --- a/kernel/riscv64/swap.c +++ b/kernel/riscv64/swap.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG ix=0,iy=0; FLOAT temp; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac27..18b6315cb 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c5..b0f19efd5 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c index ae4760ae0..df1402b94 100644 --- a/kernel/riscv64/zswap.c +++ b/kernel/riscv64/zswap.c @@ -45,7 +45,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; From 5b4df851d7581145f0aee4336f11127a3a7acc8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 08:29:05 +0100 Subject: [PATCH 138/191] fix stray blank on continuation line --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 6f320d8f7..a4d3f710a 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,8 +92,9 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) \ - cspr2.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + csymv.$(SUFFIX) csyr.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ From 1c04df20bd9c845160b3eb2e51adaceb6f93cf8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Mar 2023 23:04:12 +0100 Subject: [PATCH 139/191] Re-enable overriding the LAPACK SYMV,SYR,SPMV and SPR implementations --- lapack-netlib/SRC/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 49798b0c5..5f22789bd 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,22 +572,26 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o + slauum.o strti2.o strtri.o strtrs.o \ + ssymv.o ssyr.o sspmv.o sspr.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ + dsymv.o dsyr.o dspmv.o dspr.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o + clauum.o ctrti2.o ctrtri.o 
ctrtrs.o \ + csymv.o csyr.o cspmv.o cspr.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ + zsymv.o zsyr.o zspmv.o zspr.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From 5222b5fc18829265be7ffc77e77271a18f17c005 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 12 Oct 2023 22:06:00 +0300 Subject: [PATCH 140/191] Added axpby kernels for GENERIC RISC-V target --- kernel/riscv64/KERNEL.RISCV64_GENERIC | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b91..15bcd2289 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = ../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c From f1291614536d7d1bec6508fda9b0c56dd7286bb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 07:43:03 +0100 Subject: [PATCH 141/191] restore C/Z SPMV, SPR, SYR,SYMV --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index a4d3f710a..3db4b2b6d 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -122,8 +122,9 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) \ - zspr2.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From 85548e66ca25228a73ec08c257d5d92108b94b62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 16:33:09 +0200 Subject: [PATCH 142/191] Fix build failures seen with the NO_LAPACK option - cspr/csymv/csyr belong on the LAPACK list --- interface/Makefile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 3db4b2b6d..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,9 +92,8 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ - cspr.$(SUFFIX) cspr2.$(SUFFIX) \ - csymv.$(SUFFIX) csyr.$(SUFFIX) \ + csbmv.$(SUFFIX) \ + cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ @@ -122,9 +121,8 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ - zspr.$(SUFFIX) zspr2.$(SUFFIX) \ - zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ + zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From f7cf637d7aad0990625f41f83db74446a5908509 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 18:31:04 +0200 Subject: [PATCH 143/191] redo lost edit --- interface/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/interface/Makefile b/interface/Makefile index 6f320d8f7..275b71a1c 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,7 +445,8 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ + cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -456,8 +457,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) - + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ + zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ From f89e0034a479016ab5d9e1681abf07dab7f8cf38 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Wed, 20 Dec 2023 21:20:30 +0300 Subject: [PATCH 144/191] Fix LAPACK usage from BLAS --- interface/Makefile | 7 +++---- lapack-netlib/SRC/Makefile | 12 ++++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 275b71a1c..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,8 +445,7 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ - cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -457,8 +456,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ - zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) + XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 5f22789bd..49798b0c5 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,26 +572,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o \ - ssymv.o ssyr.o sspmv.o sspr.o + slauum.o strti2.o strtri.o strtrs.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ - dsymv.o dsyr.o dspmv.o dspr.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o \ - csymv.o csyr.o cspmv.o cspr.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ - zsymv.o zsyr.o zspmv.o zspr.o + zlauum.o ztrti2.o ztrtri.o 
ztrtrs.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From ccbc3f875bc87e92c1ab05f3b361f64d7fd95c87 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Fri, 19 Jan 2024 12:40:00 +0000 Subject: [PATCH 145/191] [RISC-V] Add RISCV64_ZVL128B target to common_riscv64.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index f11e8b75d..4b5f7dcc4 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) # include #endif From aaf65210ccba0c53408c242a2e0f5ad5d798d532 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 19 Jan 2024 19:04:21 +0000 Subject: [PATCH 146/191] Add dynamic support for Arm(R) Neoverse(TM) V2 processor Whilst I figure out how best to map the L2 parameters without duplicating all of `ARMV8SVE`, let's just map this to `NEOVERSEV1`. --- driver/others/dynamic_arm64.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 803e0b5eb..6b21028d1 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,6 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2023 The OpenBLAS Project */ +/* Copyright 2023-2024 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE; #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions.
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 16 +#define NUM_CORETYPES 17 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -178,6 +179,7 @@ static char *corename[] = { "emag8180", "neoversen1", "neoversev1", + "neoversev2", "neoversen2", "thunderx3t110", "cortexa55", @@ -198,10 +200,11 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; - if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; - if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; - if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; - if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; + if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12]; + if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; return corename[NUM_CORETYPES]; } @@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_NEOVERSEV1); - case 12: return (&gotoblas_NEOVERSEN2); - case 13: return (&gotoblas_THUNDERX3T110); - case 14: return (&gotoblas_CORTEXA55); - case 15: return (&gotoblas_ARMV8SVE); + case 12: return (&gotoblas_NEOVERSEV2); + case 13: return (&gotoblas_NEOVERSEN2); + case 14: return (&gotoblas_THUNDERX3T110); + case 15: return (&gotoblas_CORTEXA55); + case 16: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; }else return &gotoblas_NEOVERSEV1; + case 0xd4f: + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else { + return &gotoblas_NEOVERSEV2; + } #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; From 304a9b60afbc79be77193cc3bdb5bb5d503aa533 Mon Sep 17 00:00:00 2001 From: Han Gao/Revy/Rabenda Date: Sun, 21 Jan 2024 14:32:52 +0000 Subject: [PATCH 147/191] Update T-Head toolchains v2.8.0 Signed-off-by: Han Gao/Revy/Rabenda --- .github/workflows/c910v.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 30cf32b34..68ba2ddd7 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -14,8 +14,8 @@ jobs: if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest env: - xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 - toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 + toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz strategy: fail-fast: false matrix: @@ -76,7 +76,7 @@ jobs: run: | wget ${xuetie_toolchain}/${toolchain_file_name} tar -xvf ${toolchain_file_name} -C /opt - export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" + export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ 
matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) From e1afb23811256b231c259ca57d7a5f6e81ac6da5 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 7 Apr 2023 11:13:23 +0300 Subject: [PATCH 148/191] Fix BLAS and LAPACK tests for C910V and RISCV64_ZVL256B targets * Fixed bugs in dgemm, [a]min\max, asum kernels * Added zero checks for BLAS kernels * Added dsdot implementation for RVV 0.7.1 * Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets * Added additional definitions for RISCV64_ZVL256B target --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + getarch.c | 14 +++ kernel/riscv64/KERNEL.C910V | 1 + kernel/riscv64/amin_vector.c | 6 +- kernel/riscv64/asum_vector.c | 7 +- kernel/riscv64/axpby_vector.c | 2 +- kernel/riscv64/dgemm_kernel_8x4_c910v.c | 2 +- kernel/riscv64/dsdot_vector.c | 152 ++++++++++++++++++++++++ kernel/riscv64/iamin_vector.c | 4 +- kernel/riscv64/izamin_vector.c | 2 +- kernel/riscv64/nrm2_vector.c | 2 +- kernel/riscv64/nrm2_vector_dot.c | 2 +- kernel/riscv64/swap_vector.c | 2 +- kernel/riscv64/zamax_vector.c | 17 +-- kernel/riscv64/zamin_vector.c | 17 +-- kernel/riscv64/znrm2_vector.c | 2 +- kernel/riscv64/zswap_vector.c | 2 +- 19 files changed, 205 insertions(+), 38 deletions(-) create mode 100644 kernel/riscv64/dsdot_vector.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a2602..d30275f06 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL256B) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a27141..2239a3676 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL256B) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc..f65a18b50 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -121,6 +121,7 @@ Z14 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) C910V x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC diff --git a/getarch.c b/getarch.c index 772836347..12ea72052 100644 --- a/getarch.c +++ b/getarch.c @@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#else +#endif + #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa35..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 1c541f0fd..c4578eabf 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define LMUL m2 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #else # define LMUL m8 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #endif @@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); - minf = *x; + minf = ABS(*x); x += inc_x; --n; if (n == 0) return(minf); diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index 995dbf9a1..a652eafdd 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; @@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); - BLASLONG inc_xv = inc_x * gvl; for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index a58872960..0e591e697 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); @@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git 
a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index a3877a46c..c76a38099 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 141dffebf..5c03fbec7 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; - if(n <= 0) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); if(n == 1) return (ABS(x[0])); unsigned int gvl = 0; diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c index 06e61d695..dfa13a6f5 100644 --- a/kernel/riscv64/nrm2_vector_dot.c +++ b/kernel/riscv64/nrm2_vector_dot.c @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i=0, j=0; double len = 0.0 ; - if ( n < 0 ) return(0.0); + if ( n <= 0 ) return(0.0); if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0, v1; diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index 3b467a586..f583f5392 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - if (n < 0) return(0); + if (n <= 0) return(0); unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 2dee5ab29..ec4a5a1e9 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #ifdef RISCV_0p10_INTRINSICS #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) -#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) #else #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) #endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i Date: Tue, 23 Jan 2024 17:15:53 +0100 Subject: [PATCH 149/191] reset "mem structure overflowed" state on shutdown --- driver/others/memory.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index caef3e2b7..4ee8f9a2e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } - if (memory_overflowed) + if (memory_overflowed) { for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ newmemory[pos].pos = -1; #endif newmemory[pos].lock = 0; + } + free(newmemory); + newmemory = NULL; + memory_overflowed = 0; } UNLOCK_COMMAND(&alloc_lock); From 86943afa9cb96a49ad036c1a484390967b322b5e Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Wed, 24 Jan 2024 10:53:13 +0300 Subject: [PATCH 150/191] Fix x280 taget include riscv_vector.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index 4b5f7dcc4..ab3bfa25a 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) # include #endif From 73530b03fa6ecd03e7ceb2b37c234a0bb1626445 Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Wed, 24 Jan 2024 11:38:14 +0300 Subject: [PATCH 151/191] remove RISCV64_ZVL256B additional extentions --- Makefile.prebuild | 2 +- Makefile.riscv64 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 7824e15a8..98acca80e 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,7 +60,7 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL256B) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL128B) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 9d314d074..113cc57c5 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -7,8 +7,8 @@ CCOMMON_OPT += 
-march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL256B) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL128B) CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d From 276e3ebf9e1405196d36c2570bce8376b8bad2fd Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 26 Jan 2024 10:03:50 +0800 Subject: [PATCH 152/191] LoongArch64: Add dzamax and dzamin opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/camax_lasx.S | 150 +++++++++------- kernel/loongarch64/camax_lsx.S | 195 ++++++++++++--------- kernel/loongarch64/camin_lasx.S | 164 ++++++++++-------- kernel/loongarch64/camin_lsx.S | 209 +++++++++++++---------- 6 files changed, 418 insertions(+), 304 deletions(-) diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c365e9a75..e27ce3bee 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 68360faaf..f4429cfba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S index 7013430cb..f9a4e9012 100644 --- a/kernel/loongarch64/camax_lasx.S +++ b/kernel/loongarch64/camax_lasx.S @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmax.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmax.s VM1, x1, x2 - xvfmax.s VM0, x3, x4 - xvfmax.s VM0, VM0, VM1 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -182,12 
+200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S index 2e55629de..cf46cb016 100644 --- a/kernel/loongarch64/camax_lsx.S +++ b/kernel/loongarch64/camax_lsx.S @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmax.s VM1, x1, VM1 - vfmax.s VM0, VM0, VM1 + addi.d I, I, -1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmax.s VM1, x1, x2 - vfmax.s VM0, x3, x4 - vfmax.s VM0, VM0, VM1 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S index d7931d30a..c1c4c98c8 100644 --- a/kernel/loongarch64/camin_lasx.S +++ b/kernel/loongarch64/camin_lasx.S @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else xvreplve0.w VM0, VM0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmin.s VM1, x1, x2 - xvfmin.s VM0, x3, x4 - xvfmin.s VM0, VM0, VM1 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S index e9ad6b04d..ff666ea8f 100644 --- a/kernel/loongarch64/camin_lsx.S +++ b/kernel/loongarch64/camin_lsx.S @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else vreplvei.w VM0, VM0, 0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmin.s VM1, x1, VM1 - vfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmin.s VM1, x1, x2 - vfmin.s VM0, x3, x4 - vfmin.s VM0, VM0, VM1 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 From ec2aa32eb069a8aa9b2013c1f6af6b4cddb1b0dc Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Thu, 25 Jan 2024 15:20:58 +0000 Subject: [PATCH 153/191] Fix crash in cpuid_riscv64.c The crash is reproducible when building OpenBLAS without forcing a target in a riscv64 container running on an X86_64 machine with an older version of QEMU, e.g., 7.0.0, registered with binfmt_misc to run riscv64 binaries. With this setup, cat /proc/cpuinfo in the container returns the cpu information for the host, which contains a "model name" string, and we execute the buggy code. The code in question is searching in an uninitialised buffer for the ':' character and doesn't check to see whether it was found or not. This can result in pmodel containing the pointer value 1 and a crash when pmodel is defererenced. The algorithm to detect the C910V CPU has not been modified, merely fixed to prevent the crash. A few additional checks for NULL pointers are added to improve the robustness of the code and a whitespace error is corrected. 
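As a standalone illustration of the pointer bug described above (not part of the patch itself; the helper name and example values here are hypothetical): strchr() returns NULL when the separator is absent, so advancing the result unconditionally turns NULL into the invalid pointer value 1 that the commit message mentions. A minimal sketch of the defensive pattern the fix applies:

#include <stdio.h>
#include <string.h>

/* Hypothetical helper, for illustration only: return the text after the
 * ':' separator of a "key: value" line from /proc/cpuinfo, or NULL when
 * either the key or the separator is missing. */
static const char *value_after_colon(const char *line, const char *key)
{
    const char *colon;

    if (strncmp(line, key, strlen(key)) != 0)
        return NULL;                 /* not the line we are looking for */

    colon = strchr(line, ':');
    if (colon == NULL)               /* unchecked, colon + 1 would be the bogus pointer 1 */
        return NULL;

    return colon + 1;                /* safe: the separator was actually found */
}

int main(void)
{
    const char *v = value_after_colon("model name\t: T-Head C910", "model name");
    printf("%s\n", v ? v : "(not found)");
    return 0;
}
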
--- cpuid_riscv64.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..7e08af831 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -86,23 +86,29 @@ int detect(void){ char *pmodel = NULL, *pisa = NULL; infile = fopen("/proc/cpuinfo", "r"); + if (!infile) + return CPU_GENERIC; while (fgets(buffer, sizeof(buffer), infile)){ if(!strncmp(buffer, "model name", 10)){ strcpy(model_buffer, buffer); - pmodel = strchr(isa_buffer, ':') + 1; + pmodel = strchr(model_buffer, ':'); + if (pmodel) + pmodel++; } if(!strncmp(buffer, "isa", 3)){ strcpy(isa_buffer, buffer); - pisa = strchr(isa_buffer, '4') + 1; + pisa = strchr(isa_buffer, '4'); + if (pisa) + pisa++; } } fclose(infile); - if (!pmodel) + if (!pmodel || !pisa) return(CPU_GENERIC); - + if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) return CPU_C910V; From e0b610d01ff4cdb44fd2453b7efa7f51ea84e575 Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Fri, 26 Jan 2024 13:57:33 +0000 Subject: [PATCH 154/191] Harmonize riscv64 LIBNAME for forced and non-forced targets The forced values for LIBNAME were either riscv64_generic or c910v while the non-forced value of LIBNAME was always riscv64. --- cpuid_riscv64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 7e08af831..13009753a 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -78,6 +78,11 @@ static char *cpuname[] = { "C910V" }; +static char *cpuname_lower[] = { + "riscv64_generic", + "c910v" +}; + int detect(void){ #ifdef __linux FILE *infile; @@ -146,5 +151,5 @@ void get_cpuconfig(void){ } void get_libname(void){ - printf("riscv64\n"); + printf("%s", cpuname_lower[detect()]); } From 48a4c4d454530d46c0143f4f9bd3f5f7a8374757 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Jan 2024 16:30:52 +0100 Subject: [PATCH 155/191] Use +sve in arch declarations of the fallback paths for SVE targets --- Makefile.arm64 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index ed52a9424..ca053b03d 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -116,13 +116,13 @@ endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif @@ -138,7 +138,7 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 @@ -156,13 +156,13 @@ endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif From 519ea6e87aa357787896986836f853192c829930 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 10:39:22 +0800 
Subject: [PATCH 156/191] utest: Add utest for the {sc/dz}amax and {s/d/sc/dz}amin --- utest/CMakeLists.txt | 1 + utest/Makefile | 3 +- utest/test_amax.c | 35 +++++++++++++++-- utest/test_amin.c | 89 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 utest/test_amin.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index c47954ce4..41829bd22 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -16,6 +16,7 @@ else () test_dnrm2.c test_swap.c test_zscal.c + test_amin.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index d0715c754..8acaa3ea9 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(amin, samin){ + blasint N = 3, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(samin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_DOUBLE +CTEST(amin, damin){ + blasint N = 3, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(damin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX +CTEST(amin, scamin){ + blasint N = 9, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(scamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amin, dzamin){ + blasint N = 9, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(dzamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif From a79d11740580db13f101b76c81a02f93654de9a7 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:03:56 +0800 Subject: [PATCH 157/191] LoogArch64: Fixed bug for {s/d}amin --- kernel/loongarch64/amin_lasx.S | 1 - kernel/loongarch64/amin_lsx.S | 1 - 2 files changed, 2 deletions(-) diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S index 0a4359002..c91a33006 100644 --- a/kernel/loongarch64/amin_lasx.S +++ b/kernel/loongarch64/amin_lasx.S @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else xvldrepl.w VM0, X, 0 #endif - XVFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 4 diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S index 644caf43c..c3c3f4ae9 100644 --- a/kernel/loongarch64/amin_lsx.S +++ b/kernel/loongarch64/amin_lsx.S @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else vldrepl.w VM0, X, 0 #endif - VFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 3 From 3d4dfd008556b5a722162def487e0553f807e6e8 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:25:59 +0800 Subject: [PATCH 158/191] Benchmark: Rename the executable file names for {sc/dz}a{min/max} No interface named {c/z}a{min/max}, keeping it would cause ambiguity --- benchmark/Makefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..6a7c54636 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ismax.goto idmax.goto \ isamin.goto idamin.goto icamin.goto izamin.goto \ ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ + samax.goto damax.goto scamax.goto dzamax.goto \ smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ + samin.goto damin.goto scamin.goto dzamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) @@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) damax.goto : damax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMAX ############################################## +scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMAX ############################################## +dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMAX ############################################## @@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) damin.goto : damin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMIN ############################################## +scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMIN ############################################## +dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMIN ############################################## @@ -3383,10 
+3383,10 @@ samax.$(SUFFIX) : amax.c damax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camax.$(SUFFIX) : amax.c +scamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamax.$(SUFFIX) : amax.c +dzamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c damin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camin.$(SUFFIX) : amin.c +scamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamin.$(SUFFIX) : amin.c +dzamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ From 83ce97a4ca44c1aedc9f825bcb11f3a999f09c60 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 16:54:14 +0800 Subject: [PATCH 159/191] LoongArch64: Handle NAN and INF --- kernel/loongarch64/cscal_lasx.S | 149 +------------------------------- kernel/loongarch64/cscal_lsx.S | 130 +--------------------------- 2 files changed, 4 insertions(+), 275 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index 3605a6c0e..f53526663 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 xvld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index f442a754f..241d3d16e 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 vld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE From bb043a021f138a3915c835776fdfe90673644db4 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 17:27:59 +0800 Subject: [PATCH 160/191] utest: Add tests 
for zscal --- utest/test_zscal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 8992eee90..ffc851e8b 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) ASSERT_TRUE(isnan(nan[17])); } +CTEST(zscal, i_nan_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 2); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + CTEST(zscal, nan_i) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) ASSERT_TRUE(isnan(i[16])); ASSERT_TRUE(isnan(i[17])); } - + +CTEST(zscal, nan_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + CTEST(zscal, i_inf) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); ASSERT_TRUE(isinf(inf[17])); -} +} + +CTEST(zscal, i_inf_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 2); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} CTEST(zscal, inf_i) { @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) ASSERT_TRUE(isinf(i[17])); } +CTEST(zscal, inf_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + #endif From 09bb48d1b97aaf60ea40707658bf6761a9637424 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 30 Jan 2024 09:13:16 -0600 Subject: [PATCH 161/191] Vectorize in-copy packing/copying for SGEMM - 4X faster. 
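
The speedup comes from replacing the scalar element-by-element packing with
4x4 register transposes: four column vectors of A are loaded with vec_xl,
interleaved with vec_mergeh/vec_mergel, recombined with vec_xxpermdi, and
stored contiguously into the packed buffer with vec_xst. A minimal standalone
sketch of that building block follows; the helper name pack_tile_4x4 and its
signature are illustrative only and not part of this patch.

    #include <altivec.h>

    /* Sketch: transpose one 4x4 tile of a column-major matrix A (leading
       dimension lda) into four consecutive packed rows of B, as in the
       n&4 path of sgemm_ncopy_16_power.c.  Exact lane numbering differs
       between big- and little-endian, but the net effect is the same
       4x4 transpose on both. */
    static void pack_tile_4x4(float *a, long lda, float *b) {
        vector float c1 = vec_xl(0, a + 0 * lda);  /* 4 rows of column 0 */
        vector float c2 = vec_xl(0, a + 1 * lda);  /* 4 rows of column 1 */
        vector float c3 = vec_xl(0, a + 2 * lda);  /* 4 rows of column 2 */
        vector float c4 = vec_xl(0, a + 3 * lda);  /* 4 rows of column 3 */

        vector float t1 = vec_mergeh(c1, c2);      /* interleave columns 0,1 */
        vector float t2 = vec_mergeh(c3, c4);      /* interleave columns 2,3 */
        vector float t3 = vec_mergel(c1, c2);
        vector float t4 = vec_mergel(c3, c4);

        vec_xst(vec_xxpermdi(t1, t2, 0b00), 0, b + 0);   /* packed row 0 */
        vec_xst(vec_xxpermdi(t1, t2, 0b11), 0, b + 4);   /* packed row 1 */
        vec_xst(vec_xxpermdi(t3, t4, 0b00), 0, b + 8);   /* packed row 2 */
        vec_xst(vec_xxpermdi(t3, t4, 0b11), 0, b + 12);  /* packed row 3 */
    }

The 16-column and 8-column paths apply the same transpose to more columns at
a time before falling back to scalar copies for the m%4 remainder rows.
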
--- kernel/power/KERNEL.POWER10 | 2 +- kernel/power/KERNEL.POWER8 | 7 +- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/sgemm_ncopy_16_power.c | 482 ++++++++++++++++++++++++++++ 4 files changed, 487 insertions(+), 6 deletions(-) create mode 100755 kernel/power/sgemm_ncopy_16_power.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..c84cd91d2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S endif SGEMMKERNEL = sgemm_kernel_power10.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 2b8e65948..36222348a 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,11 +1,9 @@ # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -50,11 +48,9 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -90,6 +86,9 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif +SGEMMINCOPY = sgemm_ncopy_16_power.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index b6b102b3e..7d007d1a2 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c new file mode 100755 index 000000000..babe1376e --- /dev/null +++ b/kernel/power/sgemm_ncopy_16_power.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp19 ; + IFLOAT ctemp21, ctemp23 ; + IFLOAT ctemp25, ctemp27 ; + IFLOAT ctemp29, ctemp31 ; + + aoffset = a; + boffset = b; + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float c9, c10, c11, c12, c13, c14, c15, c16; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + vector float t9, t10, t11, t12; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + c9 = vec_xl(0, aoffset9); + c10 = vec_xl(0, aoffset10); + c11 = vec_xl(0, aoffset11); + c12 = vec_xl(0, aoffset12); + c13 = vec_xl(0, aoffset13); + c14 = vec_xl(0, aoffset14); + c15 = vec_xl(0, aoffset15); + c16 = vec_xl(0, aoffset16); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t9 = vec_mergeh(c9, c10); + t10 = vec_mergeh(c11, c12); + t11 = vec_mergeh(c13, 
c14); + t12 = vec_mergeh(c15, c16); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t9 = vec_mergel(c9, c10); + t10 = vec_mergel(c11, c12); + t11 = vec_mergel(c13, c14); + t12 = vec_mergel(c15, c16); + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + aoffset9 += 4; + aoffset10 += 4; + aoffset11 += 4; + aoffset12 += 4; + aoffset13 += 4; + aoffset14 += 4; + aoffset15 += 4; + aoffset16 += 4; + boffset += 64; + + i --; + }while(i > 0); + } + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + aoffset9+=1; + aoffset10+=1; + aoffset11+=1; + aoffset12+=1; + aoffset13+=1; + aoffset14+=1; + aoffset15+=1; + aoffset16+=1; + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = 
vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + boffset += 32; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + + boffset += 8; + i--; + } while (i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset += 16; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + + boffset += 4; + i--; + } while (i > 0); + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); 
+ ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} From 969601a1dcfdc4c44174346b7c752fa338f00737 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 31 Jan 2024 11:20:25 +0800 Subject: [PATCH 162/191] X86_64: Fixed bug in zscal Fixed handling of NAN and INF arguments when inc is greater than 1. --- kernel/x86_64/zscal.c | 91 ++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 66c8a0d2b..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { x[i]= NAN; @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (x[i] < -FLT_MAX || x[i] > FLT_MAX) temp0 = NAN; x[i+1] = da_i * x[i]; - if ( x[i] == x[i]) //preserve NaN + if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - + temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } From 1a6fdb035308370c08c740da279b769615594980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 15:57:57 +0100 Subject: [PATCH 163/191] Add prototypes for extensions ?AMIN/?AMAX and CAXPYC/ZAXPYC --- cblas.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cblas.h b/cblas.h index ade2fca3a..3b74e25ee 100644 --- a/cblas.h +++ b/cblas.h @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, 
OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -290,7 +303,6 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, From b54cda849096ade35bd2f69341e3d02fa1543512 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 16:00:52 +0100 Subject: [PATCH 164/191] Unify creation of CBLAS interfaces for ?AMIN/?AMAX and C/ZAXPYC between gmake and cmake builds --- interface/CMakeLists.txt | 2 ++ interface/Makefile | 49 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ed19b556a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -130,6 +130,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..99859cbf5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - 
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : 
zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From a7d004e820f1ccbc9f61b4b1353ccdb04f208690 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 17:55:42 +0100 Subject: [PATCH 165/191] Fix CBLAS prototype --- interface/max.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/interface/max.c b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; From 47bd06476312598eea694f19a00a9191041b1586 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 20:49:43 +0100 Subject: [PATCH 166/191] Fix names in build rules --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 99859cbf5..ad4a0fb89 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1541,10 +1541,10 @@ cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c @@ -1553,10 +1553,10 @@ cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c From 61c8e19f95dfefe3e586d6d83f7432b7d51299fa Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 31 Jan 2024 15:27:50 -0600 Subject: [PATCH 167/191] Fix Makefile to support OpenMP on AIX for xlc (clang) with xlf. 
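
With xlf as the Fortran compiler, the C-side OpenMP runtime depends on which
C compiler is in use: GCC links libgomp, while a clang-based xlc (IBM Open XL)
needs libomp. The pattern added to ctest/Makefile and test/Makefile, shown in
isolation (a sketch; the comments are mine, the conditionals mirror the hunks
below):

    ifeq ($(F_COMPILER), IBM)
    # GCC provides its OpenMP runtime as libgomp ...
    ifeq ($(C_COMPILER), GCC)
    CEXTRALIB += -lgomp
    endif
    # ... clang-based compilers provide it as libomp
    ifeq ($(C_COMPILER), CLANG)
    CEXTRALIB += -lomp
    endif
    endif

On AIX, utest additionally gets its own link rule: with USE_OPENMP=1 the test
binary is linked without $(FEXTRALIB), presumably to keep the Fortran runtime
libraries out of the OpenMP link; without OpenMP the existing link line is
kept.
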
--- ctest/Makefile | 3 +++ test/Makefile | 3 +++ utest/Makefile | 10 ++++++++++ 3 files changed, 16 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index af5b34a36..ad960b35a 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/test/Makefile b/test/Makefile index 56acf1c5b..5a4694ce6 100644 --- a/test/Makefile +++ b/test/Makefile @@ -276,6 +276,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/utest/Makefile b/utest/Makefile index 8acaa3ea9..ac58d6f12 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,8 +45,18 @@ endif all : run_test +ifeq ($(OSNAME), AIX) +ifeq ($(USE_OPENMP), 1) +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) +else +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif +else $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif run_test: $(UTESTBIN) ifneq ($(CROSS), 1) From a3b0ef6596d51ecfb59b0a2f6a7b0d59bc4f18b4 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Thu, 1 Feb 2024 10:26:02 +0000 Subject: [PATCH 168/191] Restore riscv64 fixes from develop branch: dot product double precision accumulation, zscal NaN handling --- Makefile.prebuild | 1 + kernel/riscv64/dot.c | 10 ++++ kernel/riscv64/zscal_rvv.c | 90 ++++++----------------------------- kernel/riscv64/zscal_vector.c | 79 +----------------------------- 4 files changed, 26 insertions(+), 154 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index b44b50039..b7d695a75 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -57,6 +57,7 @@ endif ifeq ($(TARGET), CK860FV) TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca..8ad493a2b 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + +#if defined(DSDOT) double dot = 0.0 ; +#else + FLOAT dot = 0.0 ; +#endif if ( n < 1 ) return(dot); while(i < n) { +#if defined(DSDOT) + dot += (double) y[iy] * (double) x[ix] ; +#else dot += y[iy] * x[ix] ; +#endif + ix += inc_x ; iy += inc_y ; i++ ; diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 2586c6036..ae79d9f9d 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -69,49 +69,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F size_t vlmax = VSETVL_MAX; FLOAT_VX2_T vx2; - if(da_r == 0.0 && da_i == 0.0) { + if(inc_x == 1) { - vr = VFMVVF_FLOAT(0.0, vlmax); - vi = VFMVVF_FLOAT(0.0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } - - } else if(da_r == 0.0) { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = 
VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx2 = VLSEG_FLOAT(x, vl); vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vt = VFMULVF_FLOAT(vi, -da_i, vl); - vi = VFMULVF_FLOAT(vr, da_i, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); - - VSSSEG_FLOAT(x, stride_x, vx2, vl); + VSSEG_FLOAT(x, vx2, vl); } - } else if(da_i == 0.0) { + } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); @@ -120,54 +97,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vr = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); VSSSEG_FLOAT(x, stride_x, vx2, vl); } - - } else { - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - vx2 = VLSEG_FLOAT(x, vl); - vr = VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); - vr = VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } } return(0); diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 2034aafaa..536bbdf73 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - 
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From 2bb7ea64a197c04ce36927530394db5400242d20 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 1 Feb 2024 08:11:43 -0600 Subject: [PATCH 169/191] Only vectorize 64-bit version for Power8. --- kernel/power/KERNEL.POWER8 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 36222348a..700a68e44 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,9 +1,11 @@ # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -48,9 +50,11 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -86,9 +90,6 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -SGEMMINCOPY = sgemm_ncopy_16_power.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) - STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c From 4d8dee508ce415743d61042111a9d50660b2b8bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 01:05:03 +0100 Subject: [PATCH 170/191] temporarily disable the CAXPY/ZAXPY kernels --- kernel/riscv64/KERNEL.C910V | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 2798a870e..066329390 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy_vector.c -ZAXPYKERNEL = zaxpy_vector.c +CAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c From 68d354814f9f846338e1988c4f609c8add419012 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 01:14:22 +0100 Subject: [PATCH 171/191] Fix incompatible pointer type in BFLOAT16 mode --- 
interface/gemmt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index 01dec0c35..8fd8089d0 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -527,7 +527,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -616,7 +616,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -666,4 +666,4 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_END; return; -} \ No newline at end of file +} From 3597827c932b90be8916f5fbaa27dd8db818ee9e Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 1 Feb 2024 16:33:58 +0800 Subject: [PATCH 172/191] utest: add axpby --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_axpby.c | 320 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 utest/test_axpby.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 41829bd22..edfcfb7cf 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -17,6 +17,7 @@ else () test_swap.c test_zscal.c test_amin.c + test_axpby.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index 8acaa3ea9..867f8c88c 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -12,7 +12,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ - test_amin.o + test_amin.o test_axpby.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_axpby.c b/utest/test_axpby.c new file mode 100644 index 000000000..37ba8ad14 --- /dev/null +++ b/utest/test_axpby.c @@ -0,0 +1,320 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(axpby, saxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha = 1.0, beta = 2.0; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(axpby, daxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha = 1.0, beta = 2.0; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 
5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX +CTEST(axpby, caxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 
11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX16 +CTEST(axpby, zaxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif From 1e1f487dc7e95c7e5a1d4234147439a5f4bca070 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 1 Feb 2024 19:57:05 +0800 Subject: [PATCH 173/191] LoongArch64: Fixed {s/d}axpby --- kernel/loongarch64/axpby_lasx.S | 9 +++++++-- kernel/loongarch64/axpby_lsx.S | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S index f1d99cd3b..7a246ca5c 100644 --- 
a/kernel/loongarch64/axpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXB, t2 xvreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S index 45154c262..e50d4cdcc 100644 --- a/kernel/loongarch64/axpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplgr2vr.w VXB, t2 vreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 From 7bc93d95a1d72fe733e43aa1bea64796b123dcbb Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 24 Jan 2024 16:11:45 +0800 Subject: [PATCH 174/191] LoongArch64: Opt {c/z}axpby --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/caxpby_lasx.S | 1046 ++++++++++++++++++++++ kernel/loongarch64/caxpby_lsx.S | 1029 +++++++++++++++++++++ 4 files changed, 2079 insertions(+) create mode 100644 kernel/loongarch64/caxpby_lasx.S create mode 100644 kernel/loongarch64/caxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e27ce3bee..f4ab495e6 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = axpby_lsx.S DAXPBYKERNEL = axpby_lsx.S +CAXPBYKERNEL = caxpby_lsx.S +ZAXPBYKERNEL = caxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f4429cfba..bd85fab01 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = axpby_lasx.S DAXPBYKERNEL = axpby_lasx.S +CAXPBYKERNEL = caxpby_lasx.S +ZAXPBYKERNEL = caxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S new file mode 100644 index 000000000..c5802092e --- /dev/null +++ b/kernel/loongarch64/caxpby_lasx.S @@ -0,0 +1,1046 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 
$f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXBR $xr14 +#define VXBI $xr13 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI + MTG t3, BETAR + MTG t4, BETAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + xvreplgr2vr.d VXBR, t3 + xvreplgr2vr.d VXBI, t4 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + xvreplgr2vr.w VXBR, t3 + xvreplgr2vr.w VXBI, t4 +#endif + xvxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, Y, 16 * SIZE +#endif + addi.d 
I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + xvld VX1, X, 8 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + add.d Y, Y, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + 
XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + xvld VX3, Y, 8 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge 
$r0, I, .L997 + move YY, Y + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvfmul.d x3, VXAI, x2 + xvfmul.d x4, VXAI, x1 + xvfmsub.d x3, VXAR, x1, x3 + xvfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + 
ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d Y, Y, INCY + xvfmul.d x3, VXBI, x2 + xvfmul.d x4, VXBI, x1 + xvfmsub.d x3, VXBR, x1, x3 + xvfmadd.d x4, VXBR, x2, x4 + + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d Y, Y, INCY + + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 
0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + 
xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MUL s3, BETAI, a4 + MUL s4, BETAI, a3 + MSUB s1, ALPHAR, a1, s1 + MADD s2, a2, ALPHAR, s2 + MSUB s3, BETAR, a3, s3 + MADD s4, a4, BETAR, s4 + ADD s3, s3, s1 + ADD s4, s4, s2 + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S new file mode 100644 index 000000000..247ae428e --- /dev/null +++ b/kernel/loongarch64/caxpby_lsx.S @@ -0,0 +1,1029 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXBR $vr14 +#define VXBI $vr13 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 +#ifdef DOUBLE + ffint.d.l a1, a1 +#else + ffint.s.l a1, a1 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, ALPHAR + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 + movfr2gr.d t3, BETAR + vreplgr2vr.d VXBR, t3 + movfr2gr.d t4, BETAI + vreplgr2vr.d VXBI, t4 +#else + movfr2gr.s t1, ALPHAR + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 + movfr2gr.s t3, BETAR + vreplgr2vr.w VXBR, t3 + movfr2gr.s t4, BETAI + vreplgr2vr.w VXBI, t4 +#endif + vxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + 
b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#else + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#endif + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#endif + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, Y, 4 * SIZE + vld VX1, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#else + vld VX0, Y, 0 * SIZE + vld VX1, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 
+ b .L997 + .align 3 +#endif + +.L114: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#endif + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d X, X, 8 
* SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 
+ vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, 
x4 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d Y, Y, INCY + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + 
vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: +#ifdef DOUBLE + fld.d a1, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fld.d a3, Y, 0 * SIZE + fld.d a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.d s1, ALPHAI, a2 + fmul.d s2, ALPHAI, a1 + fmul.d s3, BETAI, a4 + fmul.d s4, BETAI, a3 + fmsub.d s1, ALPHAR, a1, s1 + fmadd.d s2, a2, ALPHAR, s2 + fmsub.d s3, BETAR, a3, s3 + fmadd.d s4, a4, BETAR, s4 + fadd.d s3, s3, s1 + fadd.d s4, s4, s2 + fst.d s3, Y, 0 * SIZE + fst.d s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#else + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + fld.s a3, Y, 0 * SIZE + fld.s a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.s s1, ALPHAI, a2 + fmul.s s2, ALPHAI, a1 + fmul.s s3, BETAI, a4 + fmul.s s4, BETAI, a3 + fmsub.s s1, ALPHAR, a1, s1 + fmadd.s s2, a2, ALPHAR, s2 + fmsub.s s3, BETAR, a3, s3 + fmadd.s s4, a4, BETAR, s4 + fadd.s s3, s3, s1 + fadd.s s4, s4, s2 + fst.s s3, Y, 0 * SIZE + fst.s s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#endif +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From adde7253217a6824da1deff064b46e36c1e259ca Mon Sep 17 00:00:00 2001 From: gxw Date: Sun, 4 Feb 2024 14:43:08 +0800 
Subject: [PATCH 175/191] LoongArch64: Fixed {s/d}amin LSX optimization --- kernel/loongarch64/amin_lsx.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S index c3c3f4ae9..47701b6e4 100644 --- a/kernel/loongarch64/amin_lsx.S +++ b/kernel/loongarch64/amin_lsx.S @@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13: FABS $f0, $f0 - SUB $f0, $f0, $f0 jirl $r0, $r1, 0x0 .align 3 From d02c61e82e82b7f721ecf50dd7717547b6302a19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 10:01:27 +0100 Subject: [PATCH 176/191] Update lowercase cpunames for RISC-V --- cpuid_riscv64.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 5db54f1aa..ff7ba2aad 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -86,7 +86,10 @@ static char *cpuname[] = { static char *cpuname_lower[] = { "riscv64_generic", - "c910v" + "c910v", + "x280", + "riscv64_zvl256b", + "riscv64_zvl128b" }; int detect(void){ From e61d96303d4972181e1d1b26b1f3a88ed9e5de02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 10:05:20 +0100 Subject: [PATCH 177/191] Fix missing NO_AVX2 fallback for SapphireRapids --- driver/others/dynamic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 69a473060..e3f905265 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; From 6d8a273cca96c16873a15b59ea724834aa9b4558 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 22:07:51 +0100 Subject: [PATCH 178/191] Handle zero increment(s) in C910V ?AXPBY (#4483) * Handle zero increment(s) --- kernel/riscv64/axpby_vector.c | 58 +++++++++++++++++++++++++ kernel/riscv64/zaxpby_vector.c | 77 ++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 850fc903e..721aad2b0 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -69,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_x, stride_y, ix = 0, iy = 0; + if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + + } else { /* vectorized approach for non-zero increments */ + if(beta == 0.0){ if(alpha == 0.0){//alpha == 0 && beta == 0 if(inc_y == 1){ @@ -381,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } } return(0); + } } diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index d5ad974cf..bbf2bbe7d 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ 
b/kernel/riscv64/zaxpby_vector.c @@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); + if (inc_x == 0 || inc_y == 0) { + + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + + } else { + if(beta_r == 0.0 && beta_i == 0.0){ if(alpha_r == 0.0 && alpha_i == 0.0){ if(inc_y == 1){ @@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL } } return(0); + } } From 479e4af0893f3772eff9a75e89cf28bef91383c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Feb 2024 15:35:24 +0100 Subject: [PATCH 179/191] Rescale input vector more often to minimize relative error (Reference-LAPACK PR 981) --- lapack-netlib/SRC/clarfgp.f | 30 ++++++++++-------------------- lapack-netlib/SRC/zlarfgp.f | 30 ++++++++++-------------------- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index 47b5e47b0..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -148,33 +148,23 @@ ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. 
+ TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 6c9efb04c..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -148,33 +148,23 @@ ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * From fe3da43b7dac7af1be5538f87f707a6073fbc52c Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 6 Feb 2024 11:49:01 +0800 Subject: [PATCH 180/191] Optimized zgemm kernel 8*4 LASX, 4*4 LSX and cgemm kernel 8*4 LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 20 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- kernel/loongarch64/cgemm_kernel_8x4_lsx.S | 3313 ++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_4_lsx.S | 341 ++ kernel/loongarch64/cgemm_ncopy_8_lsx.S | 263 ++ kernel/loongarch64/cgemm_tcopy_4_lsx.S | 324 ++ kernel/loongarch64/cgemm_tcopy_8_lsx.S | 277 ++ kernel/loongarch64/zgemm_kernel_4x4_lsx.S | 2316 +++++++++++++ kernel/loongarch64/zgemm_kernel_8x4_lasx.S | 3545 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_4_lasx.S | 320 ++ kernel/loongarch64/zgemm_ncopy_4_lsx.S | 332 ++ kernel/loongarch64/zgemm_ncopy_8_lasx.S | 263 ++ kernel/loongarch64/zgemm_tcopy_4_lasx.S | 302 ++ kernel/loongarch64/zgemm_tcopy_4_lsx.S | 355 ++ kernel/loongarch64/zgemm_tcopy_8_lasx.S | 268 ++ param.h | 10 +- 16 files changed, 12248 insertions(+), 11 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_8x4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_8_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_4x4_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_8x4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_8_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index f4ab495e6..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,9 +100,13 @@ 
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -111,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index bd85fab01..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -122,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S -ZGEMMONCOPY = zgemm_ncopy_2_lasx.S -ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S new file mode 100644 index 000000000..1e9fd8524 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S @@ -0,0 +1,3313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 
+ ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.w VALPHAR, $sp, 112 + vldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + 
addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + 
add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, 
S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S new file mode 100644 index 000000000..6c4841b24 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.d VALPHAR, $sp, 112 + vldrepl.d VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v 
D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 
0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 
+ addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, 
S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, 
N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* 
if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index 003ea396f..5d2e960a2 100644 --- a/param.h +++ b/param.h @@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 From d4db6a9f16a5c82bbe1860f591cc731c4d83d7c8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 22:23:47 +0100 Subject: [PATCH 181/191] Separate the interface for SBGEMMT from GEMMT due to differences in GEMV arguments --- interface/CMakeLists.txt | 1 + interface/Makefile | 8 +- interface/gemmt.c | 3 +- interface/sbgemmt.c | 447 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 452 insertions(+), 7 deletions(-) create mode 100644 interface/sbgemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ed19b556a..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") diff --git a/interface/Makefile b/interface/Makefile index ad4a0fb89..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1303,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) 
sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1662,10 +1662,6 @@ cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) -sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1971,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/gemmt.c b/interface/gemmt.c index 8fd8089d0..018deb7fb 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -158,7 +158,8 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; - + + nrowa = m; if (transa & 1) nrowa = k; nrowb = k; #if defined(COMPLEX) diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if 
(TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} From fb99fc2e6e4ec8ecdcfffe1ca1aeb787464d2825 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Wed, 7 Feb 2024 13:42:08 +0100 Subject: [PATCH 182/191] fix type conversion warnings --- test/compare_sgemm_sbgemm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cf808b56d..4afa8bf93 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,6 +81,16 @@ float16to32 (bfloat16_bits f16) return f32.v; } +float +float32to16 (float32_bits f32) +{ + bfloat16_bits f16; + f16.bits.s = f32.bits.s; + f16.bits.e = f32.bits.e; + f16.bits.m = (uint32_t) f32.bits.m >> 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -108,16 +118,16 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + AA[j * k + i].v = float32to16( A[j * k + i] ); + BB[j * k + i].v = float32to16( B[j * k + i] ); CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) From 08ce6b1c1c6468c26607353d516e0dc8c009f58e Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 7 Feb 2024 07:54:06 -0600 Subject: [PATCH 183/191] Add missing CPU ID definitions for old versions of AIX. --- driver/others/dynamic_power.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0454f186c..16320dc40 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ +#endif + #ifdef _AIX #include From e9f480111e1d5b6f69c8053f79375b0a4242712f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Feb 2024 19:57:18 +0100 Subject: [PATCH 184/191] fix sbgemm bfloat16 conversion errors introduced in PR 4488 --- test/compare_sgemm_sbgemm.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 4afa8bf93..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,16 +81,6 @@ float16to32 (bfloat16_bits f16) return f32.v; } -float -float32to16 (float32_bits f32) -{ - bfloat16_bits f16; - f16.bits.s = f32.bits.s; - f16.bits.e = f32.bits.e; - f16.bits.m = (uint32_t) f32.bits.m >> 16; - return f32.v; -} - int main (int argc, char *argv[]) { @@ -110,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -118,8 +110,10 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = float32to16( A[j * k + i] ); - BB[j * k + i].v = float32to16( B[j * k + i] ); + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + 
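A minimal usage sketch (illustrative only, not part of this patch; it assumes a
DYNAMIC_ARCH build and the standard cblas.h declarations such as
openblas_get_corename()):

#include <stdio.h>
#include <cblas.h>

int main(void) {
    float x[4] = {1, 2, 3, 4}, y[4] = {1, 1, 1, 1};
    /* By the time the first BLAS call executes, gotoblas_dynamic_init() has
     * already run (usually from a library constructor).  With the program
     * launched as `GET_OPENBLAS_CORETYPE=1 ./a.out`, that init now prints
     * "Core: <name>" to stderr in addition to the existing verbosity path. */
    float d = cblas_sdot(4, x, 1, y, 1);
    /* The detected core can also be queried programmatically. */
    printf("dot = %f, core = %s\n", d, openblas_get_corename());
    return 0;
}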
AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } From b3fa16345d83b723b8984b78dc6a2bb5d9f3d479 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:15:34 +0100 Subject: [PATCH 185/191] fix prototype for c/zaxpby --- common_interface.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_interface.h b/common_interface.h index 61a82c306..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -773,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); +void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); From 500ac4de5e20596d5cd797d745db97dd0a62ff86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:18:34 +0100 Subject: [PATCH 186/191] fix incompatible pointer types --- interface/zaxpby.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else From ac6b4b7aa472799245c497ab3caa6e77c0e80852 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 08:56:30 -0600 Subject: [PATCH 187/191] Make sure CPU ID works for all POWER_10 conditions --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 16320dc40..cd5e88922 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -69,7 +69,7 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; + else if (arch >= POWER_10) return CPU_POWER10; #endif return CPU_UNKNOWN; } From d408ecedba8cc1d9b799b89df6996bdcd996d5fc Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 12:17:18 -0600 Subject: [PATCH 188/191] Add environment variable to display coretype for dynamic arch. 
--- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index cd5e88922..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -339,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { From 83bec5135534c94fe1790aa5795620a7cdeaa916 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 21:23:48 +0100 Subject: [PATCH 189/191] Update with recently added CBLAS interfaces and LAPACK/LAPACKE functions --- exports/gensymbol | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index 704eab06f..93edf2400 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,6 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -69,6 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -80,6 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax cblas_samin " cblasobjsz=" @@ -91,6 +94,7 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -861,6 +865,51 @@ lapackobjs2z="$lapackobjs2z zgedmd zgedmdq " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + claqz0 + claqz1 + claqz2 + claqz3 + clatrs3 + ctrsyl3 + " +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlaqz0 + dlaqz1 + dlaqz2 + dlaqz3 + dlaqz4 + dlarmm + dlatrs3 + dtrsyl3 + " +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlaqz0 + zlaqz1 + zlaqz2 + zlaqz3 + zlatrs3 + zrscl + ztrsyl3 +" + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1622,6 +1671,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -2239,6 +2296,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + 
LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2848,6 +2913,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -3515,6 +3588,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3616,6 +3697,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3641,6 +3723,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3658,6 +3741,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3682,6 +3766,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib From 93872f4681430a6ae9d857d8044d36ba98d09bf2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 23:02:09 +0100 Subject: [PATCH 190/191] drop the ?laqz? symbols for now (not translatable by f2c) --- exports/gensymbol | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 93edf2400..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -871,13 +871,14 @@ lapackobjs2z="$lapackobjs2z lapackobjs2c="$lapackobjs2c claqp2rk claqp3rk - claqz0 - claqz1 - claqz2 - claqz3 - clatrs3 ctrsyl3 " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + lapackobjs2d="$lapackobjs2d dgelqs dgelst @@ -885,15 +886,16 @@ lapackobjs2d="$lapackobjs2d dgeqrs dlaqp2rk dlaqp3rk - dlaqz0 - dlaqz1 - dlaqz2 - dlaqz3 - dlaqz4 dlarmm dlatrs3 dtrsyl3 " +# dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + lapackobjs2z="$lapackobjs2z zgelqs zgelst @@ -901,14 +903,14 @@ lapackobjs2z="$lapackobjs2z zgeqrs zlaqp2rk zlaqp3rk - zlaqz0 - zlaqz1 - zlaqz2 - zlaqz3 zlatrs3 zrscl ztrsyl3 -" + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx From ff1523163f71e1ad9f1e4a2c8548416f213e7cb1 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 9 Feb 2024 12:59:14 +0000 Subject: [PATCH 191/191] Fix axpy test hangs when n==0. Reenable zaxpy_vector kernel for C910V. 
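For reference, a minimal regression sketch (not part of this patch; it only
assumes the standard CBLAS entry points and is meant to run against a C910V
build):

#include <stdio.h>
#include <cblas.h>

int main(void) {
    float  x[2] = {1.0f, 2.0f}, y[2] = {3.0f, 4.0f};
    double zx[4] = {1, 2, 3, 4}, zy[4] = {5, 6, 7, 8};
    double alpha[2] = {1.0, 0.0};

    /* n == 0 must return immediately; with the old `n < 0` guard the
     * vector kernels could hang on a zero-length call. */
    cblas_saxpy(0, 2.0f, x, 1, y, 1);
    /* Complex axpy is dispatched to zaxpy_vector.c again on C910V. */
    cblas_zaxpy(0, alpha, zx, 1, zy, 1);

    printf("y[0] = %f (unchanged, as expected)\n", y[0]);
    return 0;
}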
--- kernel/riscv64/KERNEL.C910V | 4 ++-- kernel/riscv64/axpy_vector.c | 2 +- kernel/riscv64/zaxpy_vector.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 066329390..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index e99ca8542..6dffe5f09 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index d19e51118..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -45,10 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);